Spaces:
Running
Running
gradio code parameters
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import asyncio
|
@@ -9,7 +10,7 @@ import json
|
|
9 |
from openai import AsyncOpenAI
|
10 |
|
11 |
from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
|
12 |
-
from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
|
13 |
from utils.merge_audio import merge_mp3_files
|
14 |
|
15 |
# --- Configuration ---
|
@@ -17,7 +18,6 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
17 |
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
|
18 |
MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
|
19 |
|
20 |
-
# Ensure API key is available
|
21 |
if not OPENAI_API_KEY:
|
22 |
try:
|
23 |
from huggingface_hub import HfApi
|
@@ -37,12 +37,10 @@ if OPENAI_API_KEY:
|
|
37 |
else:
|
38 |
print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
|
39 |
|
40 |
-
# TTS Models available in the UI
|
41 |
TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
|
42 |
-
if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE:
|
43 |
MODEL_DEFAULT = "tts-1-hd"
|
44 |
|
45 |
-
|
46 |
SPEAKER_CONFIG_METHODS = [
|
47 |
"Single Voice (Global)",
|
48 |
"Random per Speaker",
|
@@ -50,21 +48,15 @@ SPEAKER_CONFIG_METHODS = [
|
|
50 |
"Detailed Configuration (JSON)"
|
51 |
]
|
52 |
DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
|
53 |
-
|
54 |
-
# For UI elements that need the list of voices
|
55 |
APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
|
56 |
-
|
57 |
-
|
58 |
-
# Global store for parsed speaker configurations
|
59 |
_speaker_config_cache = {}
|
60 |
|
61 |
def parse_detailed_speaker_config(json_text, parsed_script_lines):
|
62 |
-
""" Parses the JSON config string into a speaker map. """
|
63 |
config_map = {}
|
64 |
default_voice = APP_AVAILABLE_VOICES[0]
|
65 |
try:
|
66 |
if not json_text.strip():
|
67 |
-
return {}
|
68 |
|
69 |
config_list = json.loads(json_text)
|
70 |
if not isinstance(config_list, list):
|
@@ -72,46 +64,38 @@ def parse_detailed_speaker_config(json_text, parsed_script_lines):
|
|
72 |
|
73 |
for item in config_list:
|
74 |
if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
|
75 |
-
# Log warning or skip malformed item
|
76 |
print(f"Skipping malformed item in JSON config: {item}")
|
77 |
continue
|
78 |
if item["voice"] not in APP_AVAILABLE_VOICES:
|
79 |
print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
|
80 |
item["voice"] = default_voice
|
81 |
|
82 |
-
# Validate speed if present
|
83 |
if "speed" in item:
|
84 |
try:
|
85 |
item["speed"] = float(item["speed"])
|
86 |
if not (0.25 <= item["speed"] <= 4.0):
|
87 |
-
print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping
|
88 |
-
item["speed"] = max(0.25, min(item["speed"], 4.0))
|
89 |
except ValueError:
|
90 |
print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
|
91 |
-
# Let it be None or remove, so global/default speed applies
|
92 |
item.pop("speed", None)
|
93 |
|
94 |
-
|
95 |
config_map[item["speaker"]] = {
|
96 |
"voice": item["voice"],
|
97 |
-
"speed": item.get("speed"),
|
98 |
-
"instructions": item.get("instructions")
|
99 |
}
|
100 |
return config_map
|
101 |
except json.JSONDecodeError as e:
|
102 |
raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
|
103 |
-
except ValueError as e:
|
104 |
-
raise e
|
105 |
-
except Exception as e:
|
106 |
raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
|
107 |
|
108 |
-
|
109 |
def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
|
110 |
global_selected_voice, detailed_config_map):
|
111 |
-
global _speaker_config_cache
|
112 |
-
|
113 |
-
# If method changed or cache is for a different set of speakers, reset it.
|
114 |
-
# A more robust cache key might involve hashing unique_script_speakers.
|
115 |
if _speaker_config_cache.get("__method") != speaker_config_method or \
|
116 |
_speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
|
117 |
_speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
|
@@ -124,72 +108,51 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
|
|
124 |
|
125 |
if speaker_config_method == "Detailed Configuration (JSON)":
|
126 |
if speaker_name in detailed_config_map:
|
127 |
-
# Merge with base_config to ensure all keys are present if some are optional in JSON
|
128 |
-
# JSON values take precedence
|
129 |
speaker_specific = detailed_config_map[speaker_name]
|
130 |
return {
|
131 |
"voice": speaker_specific.get("voice", base_config["voice"]),
|
132 |
-
"speed": speaker_specific.get("speed"),
|
133 |
-
"instructions": speaker_specific.get("instructions")
|
134 |
}
|
135 |
-
else:
|
136 |
-
# Could use a default voice, or a cycling voice for unmapped speakers
|
137 |
-
# For now, let's use the first voice as a simple fallback.
|
138 |
print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
|
139 |
return base_config
|
140 |
|
141 |
-
|
142 |
-
# For "Random" and "A/B", we only map voices. Speed/Instructions will be global.
|
143 |
if speaker_name not in _speaker_config_cache:
|
144 |
if speaker_config_method == "Random per Speaker":
|
145 |
-
# Assign a random voice if not already cached for this run
|
146 |
available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
|
147 |
-
# Ensure all unique speakers get an assignment before reusing voices from cache build
|
148 |
if not _speaker_config_cache.get("__all_assigned_random"):
|
149 |
for i, spk_unique in enumerate(unique_script_speakers):
|
150 |
if spk_unique not in _speaker_config_cache:
|
151 |
_speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
|
152 |
_speaker_config_cache["__all_assigned_random"] = True
|
153 |
-
|
154 |
-
# If somehow still not found (should not happen if pre-populated)
|
155 |
if speaker_name not in _speaker_config_cache:
|
156 |
_speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
|
157 |
-
|
158 |
elif speaker_config_method == "A/B Round Robin":
|
159 |
-
# Assign voices in order if not already cached
|
160 |
if not _speaker_config_cache.get("__all_assigned_ab"):
|
161 |
for i, spk_unique in enumerate(unique_script_speakers):
|
162 |
if spk_unique not in _speaker_config_cache:
|
163 |
_speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
|
164 |
_speaker_config_cache["__all_assigned_ab"] = True
|
165 |
-
|
166 |
-
if speaker_name not in _speaker_config_cache: # Fallback, should be populated
|
167 |
speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
|
168 |
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
|
169 |
-
else:
|
170 |
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
|
171 |
|
172 |
-
# Return only voice for Random/AB, speed/instructions will be from global UI inputs
|
173 |
cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
|
174 |
return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
|
175 |
|
176 |
-
|
177 |
async def handle_script_processing(
|
178 |
-
dialogue_script: str,
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
global_voice_selection: str, # From dropdown if "Single Voice (Global)"
|
183 |
-
detailed_speaker_json: str, # From JSON input
|
184 |
-
global_speed: float,
|
185 |
-
global_instructions: str,
|
186 |
-
progress=gr.Progress(track_tqdm=True)
|
187 |
-
):
|
188 |
global _speaker_config_cache
|
189 |
-
_speaker_config_cache = {}
|
190 |
|
191 |
if not OPENAI_API_KEY or not async_openai_client:
|
192 |
-
return None, None, "Error: OPENAI_API_KEY is not configured.
|
193 |
if not dialogue_script.strip():
|
194 |
return None, None, "Error: Script is empty."
|
195 |
|
@@ -213,266 +176,159 @@ async def handle_script_processing(
|
|
213 |
except ValueError as e:
|
214 |
return None, None, f"Configuration Error: {str(e)}"
|
215 |
|
216 |
-
tasks = []
|
217 |
-
line_audio_files = [None] * len(parsed_lines)
|
218 |
-
|
219 |
for i, line_data in enumerate(parsed_lines):
|
220 |
speaker_name = line_data["speaker"]
|
221 |
-
|
222 |
-
# Get base config (primarily voice) based on method
|
223 |
-
# For "Random" and "A/B", this will just return voice. Speed/instructions are taken from global.
|
224 |
-
# For "Single Voice", it returns the globally selected voice.
|
225 |
-
# For "Detailed JSON", it returns voice, and potentially per-speaker speed/instructions.
|
226 |
speaker_base_cfg = get_config_for_speaker(
|
227 |
speaker_name, speaker_config_method, unique_speakers,
|
228 |
-
global_voice_selection, parsed_detailed_config_map
|
229 |
-
)
|
230 |
-
|
231 |
line_voice = speaker_base_cfg["voice"]
|
232 |
|
233 |
-
|
234 |
-
|
235 |
-
effective_speed = global_speed # Start with global
|
236 |
-
if speaker_base_cfg.get("speed") is not None: # If JSON provided a speed for this speaker
|
237 |
effective_speed = speaker_base_cfg["speed"]
|
238 |
|
239 |
effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
|
240 |
-
if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
|
241 |
effective_instructions = speaker_base_cfg["instructions"]
|
242 |
|
243 |
output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
|
244 |
progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
voice=line_voice,
|
251 |
-
output_path=output_filename,
|
252 |
-
model=tts_model,
|
253 |
-
speed=effective_speed,
|
254 |
-
instructions=effective_instructions,
|
255 |
-
nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
|
256 |
-
line_index=line_data['id']
|
257 |
-
)
|
258 |
-
)
|
259 |
|
260 |
synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
|
261 |
-
|
262 |
for idx, result in enumerate(synthesis_results):
|
263 |
-
if isinstance(result, Exception):
|
264 |
-
|
265 |
-
|
266 |
-
print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
|
267 |
-
else:
|
268 |
-
line_audio_files[idx] = result
|
269 |
|
270 |
valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
271 |
-
|
272 |
if not valid_audio_files:
|
273 |
shutil.rmtree(job_audio_path_prefix)
|
274 |
-
return None, None, "Error: No audio files
|
275 |
|
276 |
zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
277 |
with zipfile.ZipFile(zip_filename, 'w') as zf:
|
278 |
-
for
|
279 |
-
zf.write(audio_file_path, os.path.basename(audio_file_path))
|
280 |
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
if len(valid_audio_files)
|
287 |
-
|
288 |
-
|
289 |
-
final_status += "Merged audio could not be created."
|
290 |
-
elif not merged_output_path and len(valid_audio_files) == 0:
|
291 |
-
final_status += "No audio generated."
|
292 |
-
else:
|
293 |
-
final_status += "Outputs generated."
|
294 |
|
295 |
-
return zip_filename if os.path.exists(zip_filename) else None,
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
|
300 |
def handle_calculate_cost(dialogue_script: str, tts_model: str):
|
301 |
-
if not dialogue_script.strip():
|
302 |
-
return "Cost: $0.000000 (Script is empty)"
|
303 |
try:
|
304 |
-
|
305 |
-
if not
|
306 |
-
|
307 |
-
cost
|
308 |
-
|
309 |
-
except
|
310 |
-
return f"Error: {str(e)}"
|
311 |
-
except Exception as e:
|
312 |
-
return f"Error calculating cost: {str(e)}"
|
313 |
|
314 |
-
# --- Gradio Interface ---
|
315 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
316 |
gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
|
317 |
-
gr.Markdown(
|
318 |
-
"Convert dialogue scripts to speech with fine-grained control over voices, speed, and instructions."
|
319 |
-
)
|
320 |
if not OPENAI_API_KEY or not async_openai_client:
|
321 |
-
gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY
|
322 |
|
323 |
with gr.Row():
|
324 |
with gr.Column(scale=2):
|
325 |
-
script_input = gr.TextArea(
|
326 |
-
label="Dialogue Script",
|
327 |
-
placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!",
|
328 |
-
lines=10,
|
329 |
-
info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
|
330 |
-
)
|
331 |
with gr.Column(scale=1):
|
332 |
-
tts_model_dropdown = gr.Dropdown(
|
333 |
-
|
334 |
-
|
335 |
-
)
|
336 |
-
pause_input = gr.Number(
|
337 |
-
label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
|
338 |
-
info="Silence duration in milliseconds between merged lines."
|
339 |
-
)
|
340 |
-
# Model-specific global controls
|
341 |
-
global_speed_input = gr.Slider(
|
342 |
-
minimum=0.25, maximum=4.0, value=1.0, step=0.05,
|
343 |
-
label="Global Speech Speed (for tts-1/tts-1-hd)",
|
344 |
-
visible= (MODEL_DEFAULT in ["tts-1", "tts-1-hd"]),
|
345 |
-
interactive=True
|
346 |
-
)
|
347 |
-
global_instructions_input = gr.Textbox(
|
348 |
-
label="Global Voice Instructions (for gpt-4o-mini-tts)",
|
349 |
-
placeholder="e.g., Speak in a calm, reassuring tone.",
|
350 |
-
visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"),
|
351 |
-
interactive=True, lines=2
|
352 |
-
)
|
353 |
|
354 |
gr.Markdown("### Speaker Configuration")
|
355 |
with gr.Row():
|
356 |
-
speaker_config_method_dropdown = gr.Dropdown(
|
357 |
-
|
358 |
-
)
|
359 |
-
global_voice_dropdown = gr.Dropdown(
|
360 |
-
APP_AVAILABLE_VOICES, label="Global Voice (for 'Single Voice' method)",
|
361 |
-
value=APP_AVAILABLE_VOICES[0],
|
362 |
-
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"),
|
363 |
-
interactive=True
|
364 |
-
)
|
365 |
|
366 |
-
initial_json_config_example = """
|
367 |
-
[
|
368 |
{"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
|
369 |
{"speaker": "Bob", "voice": "echo"},
|
370 |
{"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}
|
371 |
-
]
|
372 |
-
""".strip()
|
373 |
|
374 |
detailed_speaker_config_input = gr.Code(
|
375 |
label="Detailed Speaker Configuration (JSON)",
|
376 |
language="json",
|
377 |
lines=7,
|
378 |
-
value=initial_json_config_example,
|
379 |
-
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)")
|
380 |
-
|
|
|
|
|
|
|
|
|
|
|
381 |
)
|
382 |
|
383 |
with gr.Row():
|
384 |
-
calculate_cost_button = gr.Button("Calculate
|
385 |
-
generate_button = gr.Button("Generate Audio
|
386 |
|
387 |
cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
|
388 |
-
|
389 |
with gr.Row():
|
390 |
-
individual_lines_zip_output = gr.File(label="Download ZIP
|
391 |
-
merged_dialogue_mp3_output = gr.Audio(label="Merged
|
392 |
-
|
393 |
status_output = gr.Textbox(label="Status", interactive=False, lines=2)
|
394 |
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
is_gpt_mini = selected_model == "gpt-4o-mini-tts" # And any other future models supporting instructions
|
399 |
return {
|
400 |
-
global_speed_input: gr.update(visible=
|
401 |
global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
|
402 |
}
|
|
|
403 |
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
outputs=[global_speed_input, global_instructions_input]
|
408 |
-
)
|
409 |
-
|
410 |
-
def update_speaker_config_visibility(config_method):
|
411 |
-
is_single_voice = (config_method == "Single Voice (Global)")
|
412 |
-
is_detailed_json = (config_method == "Detailed Configuration (JSON)")
|
413 |
return {
|
414 |
-
global_voice_dropdown: gr.update(visible=
|
415 |
-
detailed_speaker_config_input: gr.update(visible=
|
|
|
416 |
}
|
417 |
-
|
418 |
-
speaker_config_method_dropdown.change(
|
419 |
-
fn=update_speaker_config_visibility,
|
420 |
-
inputs=[speaker_config_method_dropdown],
|
421 |
-
outputs=[global_voice_dropdown, detailed_speaker_config_input]
|
422 |
-
)
|
423 |
|
424 |
-
|
425 |
-
calculate_cost_button.click(
|
426 |
-
fn=handle_calculate_cost,
|
427 |
-
inputs=[script_input, tts_model_dropdown],
|
428 |
-
outputs=[cost_output]
|
429 |
-
)
|
430 |
-
|
431 |
generate_button.click(
|
432 |
fn=handle_script_processing,
|
433 |
-
inputs=[
|
434 |
-
|
435 |
-
|
436 |
-
global_speed_input, global_instructions_input
|
437 |
-
],
|
438 |
-
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
439 |
-
)
|
440 |
|
441 |
gr.Markdown("## Examples")
|
442 |
-
|
443 |
-
|
444 |
-
[
|
445 |
-
|
446 |
-
{"speaker": "Bob", "voice": "echo", "speed": 0.9}
|
447 |
-
]
|
448 |
-
""".strip()
|
449 |
-
|
450 |
-
example_script_2 = "[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain. I am speaking slowly.\n[Captain Eva] Excellent. Maintain course for Kepler-186f."
|
451 |
-
example_json_2 = """
|
452 |
-
[
|
453 |
-
{"speaker": "Captain Eva", "voice": "alloy", "speed": 1.0},
|
454 |
-
{"speaker": "Computer", "voice": "onyx", "speed": 0.8, "instructions": "sound robotic and calm"}
|
455 |
-
]
|
456 |
-
""".strip()
|
457 |
-
|
458 |
-
|
459 |
gr.Examples(
|
460 |
examples=[
|
461 |
-
[
|
462 |
-
[
|
463 |
-
["[
|
464 |
-
|
465 |
-
|
466 |
-
script_input, tts_model_dropdown, pause_input,
|
467 |
-
speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
|
468 |
-
global_speed_input, global_instructions_input
|
469 |
-
],
|
470 |
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
471 |
-
fn=handle_script_processing,
|
472 |
-
cache_examples=False,
|
473 |
-
)
|
474 |
|
475 |
if __name__ == "__main__":
|
476 |
-
if os.name == 'nt':
|
477 |
-
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
478 |
demo.launch()
|
|
|
1 |
+
# --- app.py ---
|
2 |
import gradio as gr
|
3 |
import os
|
4 |
import asyncio
|
|
|
10 |
from openai import AsyncOpenAI
|
11 |
|
12 |
from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
|
13 |
+
from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
|
14 |
from utils.merge_audio import merge_mp3_files
|
15 |
|
16 |
# --- Configuration ---
|
|
|
18 |
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
|
19 |
MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
|
20 |
|
|
|
21 |
if not OPENAI_API_KEY:
|
22 |
try:
|
23 |
from huggingface_hub import HfApi
|
|
|
37 |
else:
|
38 |
print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
|
39 |
|
|
|
40 |
TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
|
41 |
+
if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE:
|
42 |
MODEL_DEFAULT = "tts-1-hd"
|
43 |
|
|
|
44 |
SPEAKER_CONFIG_METHODS = [
|
45 |
"Single Voice (Global)",
|
46 |
"Random per Speaker",
|
|
|
48 |
"Detailed Configuration (JSON)"
|
49 |
]
|
50 |
DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
|
|
|
|
|
51 |
APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
|
|
|
|
|
|
|
52 |
_speaker_config_cache = {}
|
53 |
|
54 |
def parse_detailed_speaker_config(json_text, parsed_script_lines):
|
|
|
55 |
config_map = {}
|
56 |
default_voice = APP_AVAILABLE_VOICES[0]
|
57 |
try:
|
58 |
if not json_text.strip():
|
59 |
+
return {}
|
60 |
|
61 |
config_list = json.loads(json_text)
|
62 |
if not isinstance(config_list, list):
|
|
|
64 |
|
65 |
for item in config_list:
|
66 |
if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
|
|
|
67 |
print(f"Skipping malformed item in JSON config: {item}")
|
68 |
continue
|
69 |
if item["voice"] not in APP_AVAILABLE_VOICES:
|
70 |
print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
|
71 |
item["voice"] = default_voice
|
72 |
|
|
|
73 |
if "speed" in item:
|
74 |
try:
|
75 |
item["speed"] = float(item["speed"])
|
76 |
if not (0.25 <= item["speed"] <= 4.0):
|
77 |
+
print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping.")
|
78 |
+
item["speed"] = max(0.25, min(item["speed"], 4.0))
|
79 |
except ValueError:
|
80 |
print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
|
|
|
81 |
item.pop("speed", None)
|
82 |
|
|
|
83 |
config_map[item["speaker"]] = {
|
84 |
"voice": item["voice"],
|
85 |
+
"speed": item.get("speed"),
|
86 |
+
"instructions": item.get("instructions")
|
87 |
}
|
88 |
return config_map
|
89 |
except json.JSONDecodeError as e:
|
90 |
raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
|
91 |
+
except ValueError as e:
|
92 |
+
raise e
|
93 |
+
except Exception as e:
|
94 |
raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
|
95 |
|
|
|
96 |
def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
|
97 |
global_selected_voice, detailed_config_map):
|
98 |
+
global _speaker_config_cache
|
|
|
|
|
|
|
99 |
if _speaker_config_cache.get("__method") != speaker_config_method or \
|
100 |
_speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
|
101 |
_speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
|
|
|
108 |
|
109 |
if speaker_config_method == "Detailed Configuration (JSON)":
|
110 |
if speaker_name in detailed_config_map:
|
|
|
|
|
111 |
speaker_specific = detailed_config_map[speaker_name]
|
112 |
return {
|
113 |
"voice": speaker_specific.get("voice", base_config["voice"]),
|
114 |
+
"speed": speaker_specific.get("speed"),
|
115 |
+
"instructions": speaker_specific.get("instructions")
|
116 |
}
|
117 |
+
else:
|
|
|
|
|
118 |
print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
|
119 |
return base_config
|
120 |
|
|
|
|
|
121 |
if speaker_name not in _speaker_config_cache:
|
122 |
if speaker_config_method == "Random per Speaker":
|
|
|
123 |
available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
|
|
|
124 |
if not _speaker_config_cache.get("__all_assigned_random"):
|
125 |
for i, spk_unique in enumerate(unique_script_speakers):
|
126 |
if spk_unique not in _speaker_config_cache:
|
127 |
_speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
|
128 |
_speaker_config_cache["__all_assigned_random"] = True
|
|
|
|
|
129 |
if speaker_name not in _speaker_config_cache:
|
130 |
_speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
|
|
|
131 |
elif speaker_config_method == "A/B Round Robin":
|
|
|
132 |
if not _speaker_config_cache.get("__all_assigned_ab"):
|
133 |
for i, spk_unique in enumerate(unique_script_speakers):
|
134 |
if spk_unique not in _speaker_config_cache:
|
135 |
_speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
|
136 |
_speaker_config_cache["__all_assigned_ab"] = True
|
137 |
+
if speaker_name not in _speaker_config_cache:
|
|
|
138 |
speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
|
139 |
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
|
140 |
+
else:
|
141 |
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
|
142 |
|
|
|
143 |
cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
|
144 |
return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
|
145 |
|
|
|
146 |
async def handle_script_processing(
|
147 |
+
dialogue_script: str, tts_model: str, pause_ms: int,
|
148 |
+
speaker_config_method: str, global_voice_selection: str,
|
149 |
+
detailed_speaker_json: str, global_speed: float,
|
150 |
+
global_instructions: str, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
global _speaker_config_cache
|
152 |
+
_speaker_config_cache = {}
|
153 |
|
154 |
if not OPENAI_API_KEY or not async_openai_client:
|
155 |
+
return None, None, "Error: OPENAI_API_KEY is not configured."
|
156 |
if not dialogue_script.strip():
|
157 |
return None, None, "Error: Script is empty."
|
158 |
|
|
|
176 |
except ValueError as e:
|
177 |
return None, None, f"Configuration Error: {str(e)}"
|
178 |
|
179 |
+
tasks, line_audio_files = [], [None] * len(parsed_lines)
|
|
|
|
|
180 |
for i, line_data in enumerate(parsed_lines):
|
181 |
speaker_name = line_data["speaker"]
|
|
|
|
|
|
|
|
|
|
|
182 |
speaker_base_cfg = get_config_for_speaker(
|
183 |
speaker_name, speaker_config_method, unique_speakers,
|
184 |
+
global_voice_selection, parsed_detailed_config_map)
|
|
|
|
|
185 |
line_voice = speaker_base_cfg["voice"]
|
186 |
|
187 |
+
effective_speed = global_speed
|
188 |
+
if speaker_base_cfg.get("speed") is not None:
|
|
|
|
|
189 |
effective_speed = speaker_base_cfg["speed"]
|
190 |
|
191 |
effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
|
192 |
+
if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
|
193 |
effective_instructions = speaker_base_cfg["instructions"]
|
194 |
|
195 |
output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
|
196 |
progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
|
197 |
+
tasks.append(synthesize_speech_line(
|
198 |
+
client=async_openai_client, text=line_data["text"], voice=line_voice,
|
199 |
+
output_path=output_filename, model=tts_model, speed=effective_speed,
|
200 |
+
instructions=effective_instructions, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
|
201 |
+
line_index=line_data['id']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
204 |
for idx, result in enumerate(synthesis_results):
|
205 |
+
if isinstance(result, Exception): print(f"Error for line {parsed_lines[idx]['id']}: {result}")
|
206 |
+
elif result is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
|
207 |
+
else: line_audio_files[idx] = result
|
|
|
|
|
|
|
208 |
|
209 |
valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
|
|
210 |
if not valid_audio_files:
|
211 |
shutil.rmtree(job_audio_path_prefix)
|
212 |
+
return None, None, "Error: No audio files successfully synthesized."
|
213 |
|
214 |
zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
215 |
with zipfile.ZipFile(zip_filename, 'w') as zf:
|
216 |
+
for pth in valid_audio_files: zf.write(pth, os.path.basename(pth))
|
|
|
217 |
|
218 |
+
merged_mp3_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
|
219 |
+
merged_out_path = merge_mp3_files([f for f in line_audio_files if f], merged_mp3_fn, pause_ms)
|
220 |
+
|
221 |
+
status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
|
222 |
+
if len(valid_audio_files) < len(parsed_lines): status += "Some lines failed/skipped. "
|
223 |
+
if not merged_out_path and len(valid_audio_files) > 0: status += "Merged audio failed. "
|
224 |
+
elif not merged_out_path and len(valid_audio_files) == 0: status += "No audio generated."
|
225 |
+
else: status += "Outputs generated."
|
|
|
|
|
|
|
|
|
|
|
226 |
|
227 |
+
return (zip_filename if os.path.exists(zip_filename) else None,
|
228 |
+
merged_out_path if merged_out_path and os.path.exists(merged_out_path) else None,
|
229 |
+
status)
|
|
|
230 |
|
231 |
def handle_calculate_cost(dialogue_script: str, tts_model: str):
    """Build the text shown in the "Estimated Cost" box for a script.

    Args:
        dialogue_script: Raw script text from the UI. ``None``, an empty
            string, or whitespace-only text is treated as an empty script.
        tts_model: Model name forwarded unchanged to ``calculate_cost``.

    Returns:
        A display string. One of: a zero-cost message (empty script or no
        parsed lines), an ``"Est. Cost: ..."`` summary, or an error message.
        This function never raises; every failure is rendered as text so the
        UI callback always has something to show.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError before any message could be returned.
    if not dialogue_script or not dialogue_script.strip():
        return "Cost: $0.000000 (Empty script)"

    try:
        # parse_dialogue_script yields a (lines, character-count) pair here.
        parsed, chars = parse_dialogue_script(dialogue_script)
        if not parsed:
            return "Cost: $0.000000 (No valid lines)"
        cost = calculate_cost(chars, len(parsed), tts_model)
        return f"Est. Cost: ${cost:.6f} ({chars} chars, {len(parsed)} lines)"
    except ValueError as e:
        # ValueError is reported with its message verbatim
        # (presumably script validation errors -- confirm against the parser).
        return f"Error: {e}"
    except Exception as e:
        # Last-resort boundary for the UI callback: show the failure as text.
        return f"Cost calc error: {e}"
|
|
|
|
|
|
|
240 |
|
|
|
241 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
242 |
gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
|
243 |
+
gr.Markdown("Convert scripts with control over voices, speed, and instructions.")
|
|
|
|
|
244 |
if not OPENAI_API_KEY or not async_openai_client:
|
245 |
+
gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set. Synthesis will fail.</h3>")
|
246 |
|
247 |
with gr.Row():
|
248 |
with gr.Column(scale=2):
|
249 |
+
script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10, info=f"Max {MAX_SCRIPT_LENGTH} chars.")
|
|
|
|
|
|
|
|
|
|
|
250 |
with gr.Column(scale=1):
|
251 |
+
tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT, info="Affects controls below.")
|
252 |
+
pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50, info="Between merged lines.")
|
253 |
+
global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed (tts-1/tts-1-hd)", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
|
254 |
+
global_instructions_input = gr.Textbox(label="Global Instructions (gpt-4o-mini-tts)", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
|
256 |
gr.Markdown("### Speaker Configuration")
|
257 |
with gr.Row():
|
258 |
+
speaker_config_method_dropdown = gr.Dropdown(SPEAKER_CONFIG_METHODS, label="Speaker Config Method", value=DEFAULT_SPEAKER_CONFIG_METHOD)
|
259 |
+
global_voice_dropdown = gr.Dropdown(APP_AVAILABLE_VOICES, label="Global Voice ('Single Voice' method)", value=APP_AVAILABLE_VOICES[0], visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"), interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
|
261 |
+
initial_json_config_example = """[
|
|
|
262 |
{"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
|
263 |
{"speaker": "Bob", "voice": "echo"},
|
264 |
{"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}
|
265 |
+
]""".strip()
|
|
|
266 |
|
267 |
detailed_speaker_config_input = gr.Code(
|
268 |
label="Detailed Speaker Configuration (JSON)",
|
269 |
language="json",
|
270 |
lines=7,
|
271 |
+
value=initial_json_config_example,
|
272 |
+
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)")
|
273 |
+
)
|
274 |
+
# New Markdown for info text, visibility tied to dropdown
|
275 |
+
detailed_config_info_md = gr.Markdown(
|
276 |
+
"<small>Define voice per speaker. Optionally, `speed` (0.25-4.0) for `tts-1`/`tts-1-hd` models, "
|
277 |
+
"and `instructions` (text) for `gpt-4o-mini-tts`.</small>",
|
278 |
+
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)")
|
279 |
)
|
280 |
|
281 |
with gr.Row():
|
282 |
+
calculate_cost_button = gr.Button("Calculate Cost")
|
283 |
+
generate_button = gr.Button("Generate Audio", variant="primary")
|
284 |
|
285 |
cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
|
|
|
286 |
with gr.Row():
|
287 |
+
individual_lines_zip_output = gr.File(label="Download ZIP")
|
288 |
+
merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
|
|
|
289 |
status_output = gr.Textbox(label="Status", interactive=False, lines=2)
|
290 |
|
291 |
+
def update_model_controls(selected_model):
    """Show only the global control that applies to the selected TTS model.

    The speed slider is visible and interactive for "tts-1" / "tts-1-hd";
    the instructions textbox is visible and interactive for
    "gpt-4o-mini-tts". Any other value hides both.

    Returns:
        A dict mapping each of the two components to its ``gr.update``.
    """
    speed_enabled = selected_model in ("tts-1", "tts-1-hd")
    instructions_enabled = selected_model == "gpt-4o-mini-tts"

    speed_update = gr.update(visible=speed_enabled, interactive=speed_enabled)
    instructions_update = gr.update(
        visible=instructions_enabled, interactive=instructions_enabled
    )
    return {
        global_speed_input: speed_update,
        global_instructions_input: instructions_update,
    }
|
298 |
+
tts_model_dropdown.change(fn=update_model_controls, inputs=[tts_model_dropdown], outputs=[global_speed_input, global_instructions_input])
|
299 |
|
300 |
+
def update_speaker_controls(method):
    """Reveal the inputs that belong to the chosen speaker-config method.

    "Single Voice (Global)" shows the global voice dropdown;
    "Detailed Configuration (JSON)" shows the JSON editor and its help text.
    Every other method hides all three.

    Returns:
        A dict mapping each affected component to its ``gr.update``.
    """
    single_voice_selected = method == "Single Voice (Global)"
    detailed_json_selected = method == "Detailed Configuration (JSON)"

    updates = {}
    updates[global_voice_dropdown] = gr.update(
        visible=single_voice_selected, interactive=single_voice_selected
    )
    updates[detailed_speaker_config_input] = gr.update(
        visible=detailed_json_selected, interactive=detailed_json_selected
    )
    # The help text is display-only, so only its visibility is toggled.
    updates[detailed_config_info_md] = gr.update(visible=detailed_json_selected)
    return updates
|
308 |
+
speaker_config_method_dropdown.change(fn=update_speaker_controls, inputs=[speaker_config_method_dropdown], outputs=[global_voice_dropdown, detailed_speaker_config_input, detailed_config_info_md])
|
|
|
|
|
|
|
|
|
|
|
309 |
|
310 |
+
calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
generate_button.click(
|
312 |
fn=handle_script_processing,
|
313 |
+
inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
|
314 |
+
global_voice_dropdown, detailed_speaker_config_input, global_speed_input, global_instructions_input],
|
315 |
+
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
|
|
|
|
|
|
|
|
|
316 |
|
317 |
gr.Markdown("## Examples")
|
318 |
+
# Sample scripts and matching per-speaker JSON configs for the Examples
# gallery. The JSON is written out literally rather than assembled with a
# placeholder and str.replace, so what you read is exactly what is loaded.
ex_script1 = "[A] Hi B.\n[B] Hey A.\n[A] What's up?"
ex_json1 = (
    '[{"speaker": "A", "voice": "nova", "instructions": "curious"}, '
    '{"speaker": "B", "voice": "echo", "speed": 0.9}]'
)
ex_script2 = "[Cpt] Status?\n[Comp] Nominal. Slow.\n[Cpt] Good."
ex_json2 = (
    '[{"speaker": "Cpt", "voice": "alloy"}, '
    '{"speaker": "Comp", "voice": "onyx", "speed": 0.8, "instructions": "robotic"}]'
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
322 |
gr.Examples(
|
323 |
examples=[
|
324 |
+
[ex_script1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json1, 1.0, "Speak naturally."],
|
325 |
+
[ex_script2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json2, 1.1, ""],
|
326 |
+
["[N] Single line.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]],
|
327 |
+
inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
|
328 |
+
detailed_speaker_config_input, global_speed_input, global_instructions_input],
|
|
|
|
|
|
|
|
|
329 |
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
330 |
+
fn=handle_script_processing, cache_examples=False)
|
|
|
|
|
331 |
|
332 |
if __name__ == "__main__":
    # On Windows, switch asyncio to the selector event loop policy before
    # the app starts.
    running_on_windows = os.name == 'nt'
    if running_on_windows:
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    demo.launch()
|