app.py CHANGED
@@ -45,14 +45,15 @@ SPEAKER_CONFIG_METHODS = [
     "Single Voice (Global)",
     "Random per Speaker",
     "A/B Round Robin",
+    "Detailed Configuration (Table)" # Using Table method
 ]
+DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
 APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
+_speaker_config_cache = {}

 # --- Speaker Configuration Table Logic ---
 DATAFRAME_HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
+# Column indices for clarity (optional, but can help maintainability)
 DATAFRAME_COL_SPEAKER = 0
 DATAFRAME_COL_VOICE = 1
 DATAFRAME_COL_SPEED = 2
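For orientation, each filled-in table row follows the DATAFRAME_HEADERS order above and, as the conversion code further down shows, becomes one per-speaker entry. A hypothetical row and the entry it would produce (speaker, voice, and values invented purely for illustration):

row = ["Alice", "nova", "1.25", "Speak with a warm, upbeat tone."]  # DATAFRAME_HEADERS order
per_speaker_config = {row[0]: {"voice": row[1], "speed": float(row[2]), "instructions": row[3]}}
# -> {"Alice": {"voice": "nova", "speed": 1.25, "instructions": "Speak with a warm, upbeat tone."}}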
@@ -62,29 +63,45 @@ def get_speakers_from_script(script_text):
     if not script_text.strip():
         return []
     try:
-        # Use a simplified parsing just for speaker names if full parsing is too heavy here
-        # For now, leverage existing robust parser.
         parsed_lines, _ = parse_dialogue_script(script_text)
         unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
         return unique_speakers
+    except ValueError:
+        return []

-def convert_df_to_config_map(df_data):
-    """Converts DataFrame data (list of lists) to the internal config map."""
+def convert_df_to_config_map(df_data_obj):
     config_map = {}
+    # gr.DataFrame's value is a dict like {'data': [[row1_val1, row1_val2], ...], 'headers': [...]}
+    # or if constructed with pandas, it might be a pandas DataFrame directly.
+    # Let's handle if it's the dict structure.
+
+    # Ensure df_data_obj is not None and has 'data'
+    if not df_data_obj or 'data' not in df_data_obj or not isinstance(df_data_obj['data'], list):
+        if isinstance(df_data_obj, pd.DataFrame): # If it's already a pandas DF
+            data_list = df_data_obj.values.tolist()
+        else: # Unrecognized format or empty
+            print("DataFrame data is empty or in an unrecognized format for conversion.")
+            return config_map
+    else:
+        data_list = df_data_obj['data']
+
+    for row in data_list:
+        # Defensive check for row length, assuming DATAFRAME_HEADERS length
+        if len(row) < len(DATAFRAME_HEADERS):
+            print(f"Skipping malformed row in DataFrame data during conversion: {row}")
+            continue
+
+        speaker_name = str(row[DATAFRAME_COL_SPEAKER]).strip()
+        voice = str(row[DATAFRAME_COL_VOICE]).strip()
         speed_str = str(row[DATAFRAME_COL_SPEED]).strip()
         instructions = str(row[DATAFRAME_COL_INSTRUCTIONS]).strip()

+        if not speaker_name or speaker_name == "(Load speakers first)" or speaker_name == "(No speakers detected)":
+            continue # Skip placeholder/empty speaker names
+
         if voice not in APP_AVAILABLE_VOICES:
+            print(f"Warning: Invalid voice '{voice}' for '{speaker_name}'. Using '{APP_AVAILABLE_VOICES[0]}'.")
             voice = APP_AVAILABLE_VOICES[0]

         cfg = {"voice": voice}
@@ -92,85 +109,66 @@ def convert_df_to_config_map(df_data):
         if speed_str:
             try:
                 speed_val = float(speed_str)
+                if 0.25 <= speed_val <= 4.0: cfg["speed"] = speed_val
+                else: print(f"Warning: Speed '{speed_val}' for '{speaker_name}' out of range. Ignoring.")
+            except ValueError: print(f"Warning: Invalid speed '{speed_str}' for '{speaker_name}'. Ignoring.")

+        if instructions: cfg["instructions"] = instructions

         config_map[speaker_name] = cfg
     return config_map

+def load_speakers_for_table(script_text, current_df_obj, selected_tts_model):
     current_config_map = {}
+    if isinstance(current_df_obj, pd.DataFrame) and not current_df_obj.empty:
+        for _, row_series in current_df_obj.iterrows():
+            row = row_series.tolist() # Convert Series to list
+            if len(row) >= len(DATAFRAME_HEADERS):
+                speaker = str(row[DATAFRAME_COL_SPEAKER])
+                current_config_map[speaker] = {
+                    "voice": str(row[DATAFRAME_COL_VOICE]),
+                    "speed": str(row[DATAFRAME_COL_SPEED]),
+                    "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
+                }
+    elif isinstance(current_df_obj, dict) and current_df_obj.get('data'): # Handle raw dict from Gradio
+        for row in current_df_obj['data']:
             if len(row) >= len(DATAFRAME_HEADERS):
+                speaker = str(row[DATAFRAME_COL_SPEAKER])
                 current_config_map[speaker] = {
+                    "voice": str(row[DATAFRAME_COL_VOICE]),
                     "speed": str(row[DATAFRAME_COL_SPEED]),
                     "instructions": str(row[DATAFRAME_COL_INSTRUCTIONS])
                 }

     unique_speakers = get_speakers_from_script(script_text)
     new_df_rows = []
+    default_voice, default_speed, default_instr = APP_AVAILABLE_VOICES[0], "1.0", ""

     for speaker in unique_speakers:
+        s_cfg = current_config_map.get(speaker, {})
         new_df_rows.append([
             speaker,
+            s_cfg.get("voice", default_voice),
+            s_cfg.get("speed", default_speed),
+            s_cfg.get("instructions", default_instr)
         ])

+    speed_info = " (tts-1/hd)" if selected_tts_model in ["tts-1", "tts-1-hd"] else " (N/A)"
+    instr_info = " (gpt-4o-mini)" if selected_tts_model == "gpt-4o-mini-tts" else " (N/A)"

+    dynamic_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
+                        f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
+                        f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]

+    if not new_df_rows:
+        new_df_rows.append(["(No speakers detected in script)", default_voice, default_speed, default_instr])
+
+    return gr.update(value=pd.DataFrame(new_df_rows, columns=DATAFRAME_HEADERS), headers=dynamic_headers)

-# --- Existing parsing and config logic (slightly adapted if needed) ---
-def parse_detailed_speaker_config(json_text, parsed_script_lines): # Kept for reference or other uses, but not for Table UI
-    # ... (implementation from previous version for JSON parsing)
-    pass # For now, this function is not directly used by the Table UI.

 def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
+                           global_selected_voice, detailed_config_map_from_table): # Renamed last param
     global _speaker_config_cache
     if _speaker_config_cache.get("__method") != speaker_config_method or \
        _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
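The conversion above accepts either a pandas DataFrame or the raw {'data': [...], 'headers': [...]} dict that gr.DataFrame can hand back. A minimal standalone sketch of that normalization, with a hypothetical rows_to_config helper and a stand-in voice list (not the app's own function or its APP_AVAILABLE_VOICES):

import pandas as pd

HEADERS = ["Speaker Name", "Voice", "Speed (0.25-4.0)", "Instructions (Text)"]
KNOWN_VOICES = ["alloy", "fable"]  # illustrative stand-in for APP_AVAILABLE_VOICES

def rows_to_config(df_value):
    # Accept a pandas DataFrame or the {'data': [...], 'headers': [...]} dict form.
    if isinstance(df_value, pd.DataFrame):
        rows = df_value.values.tolist()
    elif isinstance(df_value, dict) and isinstance(df_value.get("data"), list):
        rows = df_value["data"]
    else:
        return {}
    config = {}
    for row in rows:
        if len(row) < len(HEADERS):
            continue  # skip malformed rows
        name, voice, speed_str, instructions = (str(c).strip() for c in row[:4])
        if not name or name.startswith("("):
            continue  # skip placeholder rows such as "(Load speakers first)"
        entry = {"voice": voice if voice in KNOWN_VOICES else KNOWN_VOICES[0]}
        try:
            speed = float(speed_str)
            if 0.25 <= speed <= 4.0:
                entry["speed"] = speed
        except ValueError:
            pass  # leave speed unset when it does not parse
        if instructions:
            entry["instructions"] = instructions
        config[name] = entry
    return config

print(rows_to_config(pd.DataFrame([["Alice", "fable", "1.2", "Warm tone"]], columns=HEADERS)))
print(rows_to_config({"data": [["Bob", "alloy", "fast", ""]], "headers": HEADERS}))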
@@ -182,23 +180,21 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
         base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
         return base_config

+    if speaker_config_method == "Detailed Configuration (Table)":
+        if speaker_name in detailed_config_map_from_table:
+            speaker_specific = detailed_config_map_from_table[speaker_name]
+            return {
                 "voice": speaker_specific.get("voice", base_config["voice"]),
                 "speed": speaker_specific.get("speed"),
                 "instructions": speaker_specific.get("instructions")
             }
         else:
+            print(f"Warning: Speaker '{speaker_name}' not in Table Config. Using default.")
             return base_config

+    # For "Random" and "A/B" (same as before)
     if speaker_name not in _speaker_config_cache:
         if speaker_config_method == "Random per Speaker":
             available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
             if not _speaker_config_cache.get("__all_assigned_random"):
                 for i, spk_unique in enumerate(unique_script_speakers):
@@ -209,7 +205,6 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
             _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}

         elif speaker_config_method == "A/B Round Robin":
-            # ... (same logic as before) ...
             if not _speaker_config_cache.get("__all_assigned_ab"):
                 for i, spk_unique in enumerate(unique_script_speakers):
                     if spk_unique not in _speaker_config_cache:
@@ -228,140 +223,123 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
 async def handle_script_processing(
     dialogue_script: str, tts_model: str, pause_ms: int,
     speaker_config_method: str, global_voice_selection: str,
+    speaker_details_df_value: pd.DataFrame | dict, # Can be pd.DataFrame or dict from Gradio
     global_speed: float,
     global_instructions: str, progress=gr.Progress(track_tqdm=True)):
+    global _speaker_config_cache; _speaker_config_cache = {}

+    if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
+    if not dialogue_script.strip(): return None, None, "Error: Script empty."

     job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
     if os.path.exists(job_audio_path_prefix): shutil.rmtree(job_audio_path_prefix)
     os.makedirs(job_audio_path_prefix, exist_ok=True)

     try:
+        parsed_lines, _ = parse_dialogue_script(dialogue_script)
+        if not parsed_lines: return None, None, "Error: No valid lines."
+    except ValueError as e: return None, None, f"Script error: {str(e)}"

     unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))

     parsed_detailed_config_map = {}
     if speaker_config_method == "Detailed Configuration (Table)":
         try:
             parsed_detailed_config_map = convert_df_to_config_map(speaker_details_df_value)
+        except Exception as e: return None, None, f"Error processing Speaker Table: {str(e)}"

     tasks, line_audio_files = [], [None] * len(parsed_lines)
     for i, line_data in enumerate(parsed_lines):
         speaker_name = line_data["speaker"]
         speaker_base_cfg = get_config_for_speaker(
             speaker_name, speaker_config_method, unique_speakers,
+            global_voice_selection, parsed_detailed_config_map
         )
         line_voice = speaker_base_cfg["voice"]

+        eff_speed = global_speed
+        if speaker_base_cfg.get("speed") is not None: eff_speed = speaker_base_cfg["speed"]

+        eff_instr = global_instructions if global_instructions and global_instructions.strip() else None
         if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
+            eff_instr = speaker_base_cfg["instructions"]

+        out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
+        progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
         tasks.append(synthesize_speech_line(
             client=async_openai_client, text=line_data["text"], voice=line_voice,
+            output_path=out_fn, model=tts_model, speed=eff_speed,
+            instructions=eff_instr, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
             line_index=line_data['id']))

+    results = await asyncio.gather(*tasks, return_exceptions=True)
+    for idx, res in enumerate(results):
+        if isinstance(res, Exception): print(f"Error line {parsed_lines[idx]['id']}: {res}")
+        elif res is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
+        else: line_audio_files[idx] = res

+    valid_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
+    if not valid_files:
+        shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio synthesized."

+    zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
+    with zipfile.ZipFile(zip_fn, 'w') as zf: [zf.write(p, os.path.basename(p)) for p in valid_files]

+    merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
+    merged_path = merge_mp3_files([f for f in line_audio_files if f], merged_fn, pause_ms)
+
+    status = f"{len(valid_files)}/{len(parsed_lines)} lines. "
+    if len(valid_files) < len(parsed_lines): status += "Some failed. "
+    if not merged_path and len(valid_files) > 0: status += "Merge failed. "
+    elif not merged_path: status += "No audio."
+    else: status += "Generated."

+    return (zip_fn if os.path.exists(zip_fn) else None,
+            merged_path if merged_path and os.path.exists(merged_path) else None,
             status)

 def handle_calculate_cost(dialogue_script: str, tts_model: str):
+    if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
     try:
         parsed, chars = parse_dialogue_script(dialogue_script)
+        if not parsed: return "Cost: $0.00 (No lines)"
         cost = calculate_cost(chars, len(parsed), tts_model)
+        return f"Est. Cost: ${cost:.6f}"
     except Exception as e: return f"Cost calc error: {str(e)}"

 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Dialogue Script to Speech (Table Config)")
     if not OPENAI_API_KEY or not async_openai_client:
+        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")

     with gr.Row():
         with gr.Column(scale=2):
+            script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)
         with gr.Column(scale=1):
+            tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT)
+            pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50)
+            global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
+            global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)

     gr.Markdown("### Speaker Configuration")
     with gr.Row():
         speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
+        global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)

+    # Detailed Table Configuration UI
+    # Visibility of this whole column is controlled by the dropdown selection
+    with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Table)")) as detailed_table_config_group:
+        load_speakers_button = gr.Button("Load/Refresh Speakers into Table (from Script Above)")
+        gr.Markdown(f"<small>Available voices: {', '.join(APP_AVAILABLE_VOICES)}. "
+                    "Edit table below. Speed/Instructions column applicability depends on TTS model.</small>")
         speaker_details_df = gr.DataFrame(
+            label="Per-Speaker Settings",
+            headers=DATAFRAME_HEADERS, # Initial static headers
+            value=pd.DataFrame([["(Load speakers first or type here)", APP_AVAILABLE_VOICES[0], "1.0", ""]], columns=DATAFRAME_HEADERS),
             interactive=True,
+            row_count=(1, "dynamic"), # Allow dynamic rows, starting with 1
+            col_count=(len(DATAFRAME_HEADERS), "fixed")
         )

     with gr.Row():
         calculate_cost_button = gr.Button("Calculate Cost")
@@ -371,38 +349,44 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         individual_lines_zip_output = gr.File(label="Download ZIP")
         merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
+        status_output = gr.Textbox(label="Status", interactive=False, lines=1)

+    def update_model_controls_visibility(selected_model):
         is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
         is_gpt_mini = selected_model == "gpt-4o-mini-tts"
+
+        speed_info = " (tts-1/hd)" if is_tts1 else " (N/A)"
+        instr_info = " (gpt-4o-mini)" if is_gpt_mini else " (N/A)"
+        new_df_headers = [ DATAFRAME_HEADERS[0], DATAFRAME_HEADERS[1],
+                           f"{DATAFRAME_HEADERS[2].split('(')[0].strip()}{speed_info}",
+                           f"{DATAFRAME_HEADERS[3].split('(')[0].strip()}{instr_info}" ]
         return {
             global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
             global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
+            speaker_details_df: gr.update(headers=new_df_headers) # Update DF headers dynamically
         }
+    tts_model_dropdown.change(
+        fn=update_model_controls_visibility,
+        inputs=[tts_model_dropdown],
+        outputs=[global_speed_input, global_instructions_input, speaker_details_df]
+    )

+    def update_speaker_config_method_visibility(method):
         is_single = (method == "Single Voice (Global)")
         is_detailed_table = (method == "Detailed Configuration (Table)")
         return {
             global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
+            detailed_table_config_group: gr.update(visible=is_detailed_table),
         }
+    speaker_config_method_dropdown.change(
+        fn=update_speaker_config_method_visibility,
+        inputs=[speaker_config_method_dropdown],
+        outputs=[global_voice_dropdown, detailed_table_config_group]
+    )

     load_speakers_button.click(
         fn=load_speakers_for_table,
+        inputs=[script_input, speaker_details_df, tts_model_dropdown],
         outputs=[speaker_details_df]
     )

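Both change handlers above return a dict keyed by the component objects, with gr.update(...) as values. A stripped-down, standalone sketch of that visibility-toggling pattern, assuming a Gradio version where gr.update is available (component names here are illustrative, not the app's):

import gradio as gr

with gr.Blocks() as visibility_sketch:
    method = gr.Dropdown(
        ["Single Voice (Global)", "Detailed Configuration (Table)"],
        value="Single Voice (Global)", label="Speaker Config Method")
    global_voice = gr.Textbox(label="Global Voice", visible=True)

    def toggle_controls(selected_method):
        # Returning a dict keyed by components patches only those components.
        return {global_voice: gr.update(visible=(selected_method == "Single Voice (Global)"))}

    method.change(fn=toggle_controls, inputs=[method], outputs=[global_voice])

if __name__ == "__main__":
    visibility_sketch.launch()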
@@ -410,29 +394,21 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     generate_button.click(
         fn=handle_script_processing,
         inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
+                global_voice_dropdown, speaker_details_df,
                 global_speed_input, global_instructions_input],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])

     gr.Markdown("## Examples")
-    # Examples will need to be adjusted if the "Detailed Configuration (Table)" is default and needs pre-filled table
-    # For simplicity, examples might focus on other config methods or a very simple table.
-    # For "Detailed Configuration (Table)", the flow is: type script -> click "Load Speakers" -> edit table -> generate.
-    # So, an example would just set the script and method. The table data itself isn't directly an input to an example click.
-    # We can, however, provide an example script.
     gr.Examples(
         examples=[
-            ["[Random1] Voice 1\n[Random2] Voice 2", "gpt-4o-mini-tts", 400, "Random per Speaker", APP_AVAILABLE_VOICES[0], None, 1.0, "Speak clearly."]
+            ["[Alice] This table config is new!\n[Bob] Indeed it is.", "tts-1-hd", 300, "Detailed Configuration (Table)", APP_AVAILABLE_VOICES[0], None, 1.0, ""],
+            ["[Narrator] Global voice example.", "tts-1", 0, "Single Voice (Global)", "fable", None, 1.2, ""],
         ],
         inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
+                speaker_details_df, global_speed_input, global_instructions_input],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
         fn=handle_script_processing, cache_examples=False)

 if __name__ == "__main__":
     if os.name == 'nt': asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+    demo.launch(debug=True)
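handle_script_processing fans the per-line synthesis out with asyncio.gather(..., return_exceptions=True) and then sorts the results into errors, skips, and file paths. The same pattern in a self-contained sketch, where fake_synthesize is a made-up stand-in for synthesize_speech_line:

import asyncio

async def fake_synthesize(line_index: int):
    # Stand-in for synthesize_speech_line: one line errors, one is skipped.
    if line_index == 1:
        raise RuntimeError("synthesis failed")
    if line_index == 2:
        return None  # treated as skipped
    return f"line_{line_index}.mp3"

async def main():
    tasks = [fake_synthesize(i) for i in range(4)]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    line_audio_files = [None] * len(results)
    for idx, res in enumerate(results):
        if isinstance(res, Exception):
            print(f"Error on line {idx}: {res}")
        elif res is None:
            print(f"Line {idx} skipped or failed")
        else:
            line_audio_files[idx] = res
    print([f for f in line_audio_files if f])  # only the successful paths

asyncio.run(main())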