abocha commited on
Commit
354d940
·
1 Parent(s): 7a4548f

gradio code parameters

Browse files
Files changed (1) hide show
  1. app.py +106 -250
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  import os
3
  import asyncio
@@ -9,7 +10,7 @@ import json
9
  from openai import AsyncOpenAI
10
 
11
  from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
12
- from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES # Renamed for clarity
13
  from utils.merge_audio import merge_mp3_files
14
 
15
  # --- Configuration ---
@@ -17,7 +18,6 @@ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
17
  NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
18
  MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
19
 
20
- # Ensure API key is available
21
  if not OPENAI_API_KEY:
22
  try:
23
  from huggingface_hub import HfApi
@@ -37,12 +37,10 @@ if OPENAI_API_KEY:
37
  else:
38
  print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
39
 
40
- # TTS Models available in the UI
41
  TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
42
- if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE: # Ensure default is valid
43
  MODEL_DEFAULT = "tts-1-hd"
44
 
45
-
46
  SPEAKER_CONFIG_METHODS = [
47
  "Single Voice (Global)",
48
  "Random per Speaker",
@@ -50,21 +48,15 @@ SPEAKER_CONFIG_METHODS = [
50
  "Detailed Configuration (JSON)"
51
  ]
52
  DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
53
-
54
- # For UI elements that need the list of voices
55
  APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
56
-
57
-
58
- # Global store for parsed speaker configurations
59
  _speaker_config_cache = {}
60
 
61
  def parse_detailed_speaker_config(json_text, parsed_script_lines):
62
- """ Parses the JSON config string into a speaker map. """
63
  config_map = {}
64
  default_voice = APP_AVAILABLE_VOICES[0]
65
  try:
66
  if not json_text.strip():
67
- return {} # Empty config means rely on global or other fallbacks
68
 
69
  config_list = json.loads(json_text)
70
  if not isinstance(config_list, list):
@@ -72,46 +64,38 @@ def parse_detailed_speaker_config(json_text, parsed_script_lines):
72
 
73
  for item in config_list:
74
  if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
75
- # Log warning or skip malformed item
76
  print(f"Skipping malformed item in JSON config: {item}")
77
  continue
78
  if item["voice"] not in APP_AVAILABLE_VOICES:
79
  print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
80
  item["voice"] = default_voice
81
 
82
- # Validate speed if present
83
  if "speed" in item:
84
  try:
85
  item["speed"] = float(item["speed"])
86
  if not (0.25 <= item["speed"] <= 4.0):
87
- print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping or defaulting.")
88
- item["speed"] = max(0.25, min(item["speed"], 4.0)) # Clamp
89
  except ValueError:
90
  print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
91
- # Let it be None or remove, so global/default speed applies
92
  item.pop("speed", None)
93
 
94
-
95
  config_map[item["speaker"]] = {
96
  "voice": item["voice"],
97
- "speed": item.get("speed"), # Will be None if not present or invalid
98
- "instructions": item.get("instructions") # Will be None if not present
99
  }
100
  return config_map
101
  except json.JSONDecodeError as e:
102
  raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
103
- except ValueError as e: # Catch our own ValueErrors
104
- raise e # Re-raise
105
- except Exception as e: # Catch any other unexpected errors during parsing
106
  raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
107
 
108
-
109
  def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
110
  global_selected_voice, detailed_config_map):
111
- global _speaker_config_cache # This cache helps maintain consistency for "Random" and "A/B" within a run
112
-
113
- # If method changed or cache is for a different set of speakers, reset it.
114
- # A more robust cache key might involve hashing unique_script_speakers.
115
  if _speaker_config_cache.get("__method") != speaker_config_method or \
116
  _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
117
  _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
@@ -124,72 +108,51 @@ def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_sp
124
 
125
  if speaker_config_method == "Detailed Configuration (JSON)":
126
  if speaker_name in detailed_config_map:
127
- # Merge with base_config to ensure all keys are present if some are optional in JSON
128
- # JSON values take precedence
129
  speaker_specific = detailed_config_map[speaker_name]
130
  return {
131
  "voice": speaker_specific.get("voice", base_config["voice"]),
132
- "speed": speaker_specific.get("speed"), # Allow None to use global
133
- "instructions": speaker_specific.get("instructions") # Allow None to use global
134
  }
135
- else: # Fallback for speakers in script but not in JSON map
136
- # Could use a default voice, or a cycling voice for unmapped speakers
137
- # For now, let's use the first voice as a simple fallback.
138
  print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
139
  return base_config
140
 
141
-
142
- # For "Random" and "A/B", we only map voices. Speed/Instructions will be global.
143
  if speaker_name not in _speaker_config_cache:
144
  if speaker_config_method == "Random per Speaker":
145
- # Assign a random voice if not already cached for this run
146
  available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
147
- # Ensure all unique speakers get an assignment before reusing voices from cache build
148
  if not _speaker_config_cache.get("__all_assigned_random"):
149
  for i, spk_unique in enumerate(unique_script_speakers):
150
  if spk_unique not in _speaker_config_cache:
151
  _speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
152
  _speaker_config_cache["__all_assigned_random"] = True
153
-
154
- # If somehow still not found (should not happen if pre-populated)
155
  if speaker_name not in _speaker_config_cache:
156
  _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
157
-
158
  elif speaker_config_method == "A/B Round Robin":
159
- # Assign voices in order if not already cached
160
  if not _speaker_config_cache.get("__all_assigned_ab"):
161
  for i, spk_unique in enumerate(unique_script_speakers):
162
  if spk_unique not in _speaker_config_cache:
163
  _speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
164
  _speaker_config_cache["__all_assigned_ab"] = True
165
-
166
- if speaker_name not in _speaker_config_cache: # Fallback, should be populated
167
  speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
168
  _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
169
- else: # Should not happen
170
  _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
171
 
172
- # Return only voice for Random/AB, speed/instructions will be from global UI inputs
173
  cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
174
  return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
175
 
176
-
177
  async def handle_script_processing(
178
- dialogue_script: str,
179
- tts_model: str,
180
- pause_ms: int,
181
- speaker_config_method: str,
182
- global_voice_selection: str, # From dropdown if "Single Voice (Global)"
183
- detailed_speaker_json: str, # From JSON input
184
- global_speed: float,
185
- global_instructions: str,
186
- progress=gr.Progress(track_tqdm=True)
187
- ):
188
  global _speaker_config_cache
189
- _speaker_config_cache = {} # Reset speaker config cache for each new run
190
 
191
  if not OPENAI_API_KEY or not async_openai_client:
192
- return None, None, "Error: OPENAI_API_KEY is not configured. Cannot proceed."
193
  if not dialogue_script.strip():
194
  return None, None, "Error: Script is empty."
195
 
@@ -213,266 +176,159 @@ async def handle_script_processing(
213
  except ValueError as e:
214
  return None, None, f"Configuration Error: {str(e)}"
215
 
216
- tasks = []
217
- line_audio_files = [None] * len(parsed_lines)
218
-
219
  for i, line_data in enumerate(parsed_lines):
220
  speaker_name = line_data["speaker"]
221
-
222
- # Get base config (primarily voice) based on method
223
- # For "Random" and "A/B", this will just return voice. Speed/instructions are taken from global.
224
- # For "Single Voice", it returns the globally selected voice.
225
- # For "Detailed JSON", it returns voice, and potentially per-speaker speed/instructions.
226
  speaker_base_cfg = get_config_for_speaker(
227
  speaker_name, speaker_config_method, unique_speakers,
228
- global_voice_selection, parsed_detailed_config_map
229
- )
230
-
231
  line_voice = speaker_base_cfg["voice"]
232
 
233
- # Determine effective speed and instructions
234
- # Priority: Per-speaker from JSON > Global UI > API Default (1.0 for speed, None for instructions)
235
- effective_speed = global_speed # Start with global
236
- if speaker_base_cfg.get("speed") is not None: # If JSON provided a speed for this speaker
237
  effective_speed = speaker_base_cfg["speed"]
238
 
239
  effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
240
- if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip(): # If JSON provided instructions
241
  effective_instructions = speaker_base_cfg["instructions"]
242
 
243
  output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
244
  progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
245
-
246
- tasks.append(
247
- synthesize_speech_line(
248
- client=async_openai_client,
249
- text=line_data["text"],
250
- voice=line_voice,
251
- output_path=output_filename,
252
- model=tts_model,
253
- speed=effective_speed,
254
- instructions=effective_instructions,
255
- nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
256
- line_index=line_data['id']
257
- )
258
- )
259
 
260
  synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
261
-
262
  for idx, result in enumerate(synthesis_results):
263
- if isinstance(result, Exception):
264
- print(f"Error during synthesis for line {parsed_lines[idx]['id']}: {result}")
265
- elif result is None:
266
- print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
267
- else:
268
- line_audio_files[idx] = result
269
 
270
  valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
271
-
272
  if not valid_audio_files:
273
  shutil.rmtree(job_audio_path_prefix)
274
- return None, None, "Error: No audio files were successfully synthesized."
275
 
276
  zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
277
  with zipfile.ZipFile(zip_filename, 'w') as zf:
278
- for audio_file_path in valid_audio_files:
279
- zf.write(audio_file_path, os.path.basename(audio_file_path))
280
 
281
- ordered_valid_files_for_merge = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
282
- merged_mp3_filename = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
283
- merged_output_path = merge_mp3_files(ordered_valid_files_for_merge, merged_mp3_filename, pause_ms)
284
-
285
- final_status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
286
- if len(valid_audio_files) < len(parsed_lines):
287
- final_status += "Some lines failed or were skipped. "
288
- if not merged_output_path and len(valid_audio_files) > 0 :
289
- final_status += "Merged audio could not be created."
290
- elif not merged_output_path and len(valid_audio_files) == 0:
291
- final_status += "No audio generated."
292
- else:
293
- final_status += "Outputs generated."
294
 
295
- return zip_filename if os.path.exists(zip_filename) else None, \
296
- merged_output_path if merged_output_path and os.path.exists(merged_output_path) else None, \
297
- final_status
298
-
299
 
300
  def handle_calculate_cost(dialogue_script: str, tts_model: str):
301
- if not dialogue_script.strip():
302
- return "Cost: $0.000000 (Script is empty)"
303
  try:
304
- parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
305
- if not parsed_lines:
306
- return "Cost: $0.000000 (No valid lines)"
307
- cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
308
- return f"Estimated OpenAI Cost: ${cost:.6f} (for {total_chars} characters in {len(parsed_lines)} lines)"
309
- except ValueError as e: # Handles MAX_SCRIPT_LENGTH
310
- return f"Error: {str(e)}"
311
- except Exception as e:
312
- return f"Error calculating cost: {str(e)}"
313
 
314
- # --- Gradio Interface ---
315
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
316
  gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
317
- gr.Markdown(
318
- "Convert dialogue scripts to speech with fine-grained control over voices, speed, and instructions."
319
- )
320
  if not OPENAI_API_KEY or not async_openai_client:
321
- gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid. Speech synthesis will fail.</h3>")
322
 
323
  with gr.Row():
324
  with gr.Column(scale=2):
325
- script_input = gr.TextArea(
326
- label="Dialogue Script",
327
- placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!",
328
- lines=10,
329
- info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
330
- )
331
  with gr.Column(scale=1):
332
- tts_model_dropdown = gr.Dropdown(
333
- TTS_MODELS_AVAILABLE, label="OpenAI TTS Model", value=MODEL_DEFAULT,
334
- info="Select TTS model. Affects available controls below."
335
- )
336
- pause_input = gr.Number(
337
- label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
338
- info="Silence duration in milliseconds between merged lines."
339
- )
340
- # Model-specific global controls
341
- global_speed_input = gr.Slider(
342
- minimum=0.25, maximum=4.0, value=1.0, step=0.05,
343
- label="Global Speech Speed (for tts-1/tts-1-hd)",
344
- visible= (MODEL_DEFAULT in ["tts-1", "tts-1-hd"]),
345
- interactive=True
346
- )
347
- global_instructions_input = gr.Textbox(
348
- label="Global Voice Instructions (for gpt-4o-mini-tts)",
349
- placeholder="e.g., Speak in a calm, reassuring tone.",
350
- visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"),
351
- interactive=True, lines=2
352
- )
353
 
354
  gr.Markdown("### Speaker Configuration")
355
  with gr.Row():
356
- speaker_config_method_dropdown = gr.Dropdown(
357
- SPEAKER_CONFIG_METHODS, label="Speaker Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
358
- )
359
- global_voice_dropdown = gr.Dropdown(
360
- APP_AVAILABLE_VOICES, label="Global Voice (for 'Single Voice' method)",
361
- value=APP_AVAILABLE_VOICES[0],
362
- visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"),
363
- interactive=True
364
- )
365
 
366
- initial_json_config_example = """
367
- [
368
  {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
369
  {"speaker": "Bob", "voice": "echo"},
370
  {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}
371
- ]
372
- """.strip()
373
 
374
  detailed_speaker_config_input = gr.Code(
375
  label="Detailed Speaker Configuration (JSON)",
376
  language="json",
377
  lines=7,
378
- value=initial_json_config_example, # Corrected: Use value for default content
379
- visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)"),
380
- info="Define voice, and optionally model-compatible speed/instructions per speaker."
 
 
 
 
 
381
  )
382
 
383
  with gr.Row():
384
- calculate_cost_button = gr.Button("Calculate Estimated Cost")
385
- generate_button = gr.Button("Generate Audio Files", variant="primary")
386
 
387
  cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
388
-
389
  with gr.Row():
390
- individual_lines_zip_output = gr.File(label="Download ZIP of Individual Lines")
391
- merged_dialogue_mp3_output = gr.Audio(label="Merged Dialogue MP3", type="filepath")
392
-
393
  status_output = gr.Textbox(label="Status", interactive=False, lines=2)
394
 
395
- # --- Event Handlers for UI Interactivity ---
396
- def update_model_specific_controls_visibility(selected_model):
397
- is_tts_1_family = selected_model in ["tts-1", "tts-1-hd"]
398
- is_gpt_mini = selected_model == "gpt-4o-mini-tts" # And any other future models supporting instructions
399
  return {
400
- global_speed_input: gr.update(visible=is_tts_1_family, interactive=is_tts_1_family),
401
  global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
402
  }
 
403
 
404
- tts_model_dropdown.change(
405
- fn=update_model_specific_controls_visibility,
406
- inputs=[tts_model_dropdown],
407
- outputs=[global_speed_input, global_instructions_input]
408
- )
409
-
410
- def update_speaker_config_visibility(config_method):
411
- is_single_voice = (config_method == "Single Voice (Global)")
412
- is_detailed_json = (config_method == "Detailed Configuration (JSON)")
413
  return {
414
- global_voice_dropdown: gr.update(visible=is_single_voice, interactive=is_single_voice),
415
- detailed_speaker_config_input: gr.update(visible=is_detailed_json, interactive=is_detailed_json)
 
416
  }
417
-
418
- speaker_config_method_dropdown.change(
419
- fn=update_speaker_config_visibility,
420
- inputs=[speaker_config_method_dropdown],
421
- outputs=[global_voice_dropdown, detailed_speaker_config_input]
422
- )
423
 
424
- # --- Main Button Actions ---
425
- calculate_cost_button.click(
426
- fn=handle_calculate_cost,
427
- inputs=[script_input, tts_model_dropdown],
428
- outputs=[cost_output]
429
- )
430
-
431
  generate_button.click(
432
  fn=handle_script_processing,
433
- inputs=[
434
- script_input, tts_model_dropdown, pause_input,
435
- speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
436
- global_speed_input, global_instructions_input
437
- ],
438
- outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
439
- )
440
 
441
  gr.Markdown("## Examples")
442
- example_script_1 = "[Alice] Hi Bob, how are things?\n[Bob] Pretty good, Alice. Just working on this new project.\n[Alice] Oh, interesting! Tell me more."
443
- example_json_1 = """
444
- [
445
- {"speaker": "Alice", "voice": "nova", "instructions": "sound curious"},
446
- {"speaker": "Bob", "voice": "echo", "speed": 0.9}
447
- ]
448
- """.strip()
449
-
450
- example_script_2 = "[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain. I am speaking slowly.\n[Captain Eva] Excellent. Maintain course for Kepler-186f."
451
- example_json_2 = """
452
- [
453
- {"speaker": "Captain Eva", "voice": "alloy", "speed": 1.0},
454
- {"speaker": "Computer", "voice": "onyx", "speed": 0.8, "instructions": "sound robotic and calm"}
455
- ]
456
- """.strip()
457
-
458
-
459
  gr.Examples(
460
  examples=[
461
- [example_script_1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_1, 1.0, "Speak naturally."],
462
- [example_script_2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_2, 1.1, ""],
463
- ["[Narrator] A single line, using global settings.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]
464
- ],
465
- inputs=[
466
- script_input, tts_model_dropdown, pause_input,
467
- speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
468
- global_speed_input, global_instructions_input
469
- ],
470
  outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
471
- fn=handle_script_processing,
472
- cache_examples=False,
473
- )
474
 
475
  if __name__ == "__main__":
476
- if os.name == 'nt':
477
- asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
478
  demo.launch()
 
1
+ /// --- app.py --- ///
2
  import gradio as gr
3
  import os
4
  import asyncio
 
10
  from openai import AsyncOpenAI
11
 
12
  from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
13
+ from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES
14
  from utils.merge_audio import merge_mp3_files
15
 
16
  # --- Configuration ---
 
18
  NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
19
  MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
20
 
 
21
  if not OPENAI_API_KEY:
22
  try:
23
  from huggingface_hub import HfApi
 
37
  else:
38
  print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
39
 
 
40
  TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
41
+ if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE:
42
  MODEL_DEFAULT = "tts-1-hd"
43
 
 
44
  SPEAKER_CONFIG_METHODS = [
45
  "Single Voice (Global)",
46
  "Random per Speaker",
 
48
  "Detailed Configuration (JSON)"
49
  ]
50
  DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
 
 
51
  APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
 
 
 
52
  _speaker_config_cache = {}
53
 
54
  def parse_detailed_speaker_config(json_text, parsed_script_lines):
 
55
  config_map = {}
56
  default_voice = APP_AVAILABLE_VOICES[0]
57
  try:
58
  if not json_text.strip():
59
+ return {}
60
 
61
  config_list = json.loads(json_text)
62
  if not isinstance(config_list, list):
 
64
 
65
  for item in config_list:
66
  if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
 
67
  print(f"Skipping malformed item in JSON config: {item}")
68
  continue
69
  if item["voice"] not in APP_AVAILABLE_VOICES:
70
  print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
71
  item["voice"] = default_voice
72
 
 
73
  if "speed" in item:
74
  try:
75
  item["speed"] = float(item["speed"])
76
  if not (0.25 <= item["speed"] <= 4.0):
77
+ print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping.")
78
+ item["speed"] = max(0.25, min(item["speed"], 4.0))
79
  except ValueError:
80
  print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
 
81
  item.pop("speed", None)
82
 
 
83
  config_map[item["speaker"]] = {
84
  "voice": item["voice"],
85
+ "speed": item.get("speed"),
86
+ "instructions": item.get("instructions")
87
  }
88
  return config_map
89
  except json.JSONDecodeError as e:
90
  raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
91
+ except ValueError as e:
92
+ raise e
93
+ except Exception as e:
94
  raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
95
 
 
96
  def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
97
  global_selected_voice, detailed_config_map):
98
+ global _speaker_config_cache
 
 
 
99
  if _speaker_config_cache.get("__method") != speaker_config_method or \
100
  _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
101
  _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
 
108
 
109
  if speaker_config_method == "Detailed Configuration (JSON)":
110
  if speaker_name in detailed_config_map:
 
 
111
  speaker_specific = detailed_config_map[speaker_name]
112
  return {
113
  "voice": speaker_specific.get("voice", base_config["voice"]),
114
+ "speed": speaker_specific.get("speed"),
115
+ "instructions": speaker_specific.get("instructions")
116
  }
117
+ else:
 
 
118
  print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
119
  return base_config
120
 
 
 
121
  if speaker_name not in _speaker_config_cache:
122
  if speaker_config_method == "Random per Speaker":
 
123
  available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
 
124
  if not _speaker_config_cache.get("__all_assigned_random"):
125
  for i, spk_unique in enumerate(unique_script_speakers):
126
  if spk_unique not in _speaker_config_cache:
127
  _speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
128
  _speaker_config_cache["__all_assigned_random"] = True
 
 
129
  if speaker_name not in _speaker_config_cache:
130
  _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
 
131
  elif speaker_config_method == "A/B Round Robin":
 
132
  if not _speaker_config_cache.get("__all_assigned_ab"):
133
  for i, spk_unique in enumerate(unique_script_speakers):
134
  if spk_unique not in _speaker_config_cache:
135
  _speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
136
  _speaker_config_cache["__all_assigned_ab"] = True
137
+ if speaker_name not in _speaker_config_cache:
 
138
  speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
139
  _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
140
+ else:
141
  _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
142
 
 
143
  cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
144
  return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
145
 
 
146
  async def handle_script_processing(
147
+ dialogue_script: str, tts_model: str, pause_ms: int,
148
+ speaker_config_method: str, global_voice_selection: str,
149
+ detailed_speaker_json: str, global_speed: float,
150
+ global_instructions: str, progress=gr.Progress(track_tqdm=True)):
 
 
 
 
 
 
151
  global _speaker_config_cache
152
+ _speaker_config_cache = {}
153
 
154
  if not OPENAI_API_KEY or not async_openai_client:
155
+ return None, None, "Error: OPENAI_API_KEY is not configured."
156
  if not dialogue_script.strip():
157
  return None, None, "Error: Script is empty."
158
 
 
176
  except ValueError as e:
177
  return None, None, f"Configuration Error: {str(e)}"
178
 
179
+ tasks, line_audio_files = [], [None] * len(parsed_lines)
 
 
180
  for i, line_data in enumerate(parsed_lines):
181
  speaker_name = line_data["speaker"]
 
 
 
 
 
182
  speaker_base_cfg = get_config_for_speaker(
183
  speaker_name, speaker_config_method, unique_speakers,
184
+ global_voice_selection, parsed_detailed_config_map)
 
 
185
  line_voice = speaker_base_cfg["voice"]
186
 
187
+ effective_speed = global_speed
188
+ if speaker_base_cfg.get("speed") is not None:
 
 
189
  effective_speed = speaker_base_cfg["speed"]
190
 
191
  effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
192
+ if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():
193
  effective_instructions = speaker_base_cfg["instructions"]
194
 
195
  output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
196
  progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
197
+ tasks.append(synthesize_speech_line(
198
+ client=async_openai_client, text=line_data["text"], voice=line_voice,
199
+ output_path=output_filename, model=tts_model, speed=effective_speed,
200
+ instructions=effective_instructions, nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
201
+ line_index=line_data['id']))
 
 
 
 
 
 
 
 
 
202
 
203
  synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
 
204
  for idx, result in enumerate(synthesis_results):
205
+ if isinstance(result, Exception): print(f"Error for line {parsed_lines[idx]['id']}: {result}")
206
+ elif result is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
207
+ else: line_audio_files[idx] = result
 
 
 
208
 
209
  valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
 
210
  if not valid_audio_files:
211
  shutil.rmtree(job_audio_path_prefix)
212
+ return None, None, "Error: No audio files successfully synthesized."
213
 
214
  zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
215
  with zipfile.ZipFile(zip_filename, 'w') as zf:
216
+ for pth in valid_audio_files: zf.write(pth, os.path.basename(pth))
 
217
 
218
+ merged_mp3_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
219
+ merged_out_path = merge_mp3_files([f for f in line_audio_files if f], merged_mp3_fn, pause_ms)
220
+
221
+ status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
222
+ if len(valid_audio_files) < len(parsed_lines): status += "Some lines failed/skipped. "
223
+ if not merged_out_path and len(valid_audio_files) > 0: status += "Merged audio failed. "
224
+ elif not merged_out_path and len(valid_audio_files) == 0: status += "No audio generated."
225
+ else: status += "Outputs generated."
 
 
 
 
 
226
 
227
+ return (zip_filename if os.path.exists(zip_filename) else None,
228
+ merged_out_path if merged_out_path and os.path.exists(merged_out_path) else None,
229
+ status)
 
230
 
231
  def handle_calculate_cost(dialogue_script: str, tts_model: str):
232
+ if not dialogue_script.strip(): return "Cost: $0.000000 (Empty script)"
 
233
  try:
234
+ parsed, chars = parse_dialogue_script(dialogue_script)
235
+ if not parsed: return "Cost: $0.000000 (No valid lines)"
236
+ cost = calculate_cost(chars, len(parsed), tts_model)
237
+ return f"Est. Cost: ${cost:.6f} ({chars} chars, {len(parsed)} lines)"
238
+ except ValueError as e: return f"Error: {str(e)}"
239
+ except Exception as e: return f"Cost calc error: {str(e)}"
 
 
 
240
 
 
241
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Initial visibility is derived from the default model / speaker-config
    # choices so the UI opens in a consistent state.
    _speed_visible = MODEL_DEFAULT in ["tts-1", "tts-1-hd"]
    _instr_visible = MODEL_DEFAULT == "gpt-4o-mini-tts"
    _single_voice_visible = DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"
    _detailed_visible = DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)"

    gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
    gr.Markdown("Convert scripts with control over voices, speed, and instructions.")

    # Surface the missing-key condition prominently instead of failing later.
    if not OPENAI_API_KEY or not async_openai_client:
        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set. Synthesis will fail.</h3>")

    with gr.Row():
        with gr.Column(scale=2):
            script_input = gr.TextArea(
                label="Dialogue Script",
                placeholder="[S1] Hi!\n[S2] Hello!",
                lines=10,
                info=f"Max {MAX_SCRIPT_LENGTH} chars.",
            )
        with gr.Column(scale=1):
            tts_model_dropdown = gr.Dropdown(
                TTS_MODELS_AVAILABLE,
                label="TTS Model",
                value=MODEL_DEFAULT,
                info="Affects controls below.",
            )
            pause_input = gr.Number(
                label="Pause (ms)",
                value=500,
                minimum=0,
                maximum=5000,
                step=50,
                info="Between merged lines.",
            )
            # Speed is only honored by the tts-1 family.
            global_speed_input = gr.Slider(
                minimum=0.25,
                maximum=4.0,
                value=1.0,
                step=0.05,
                label="Global Speed (tts-1/tts-1-hd)",
                visible=_speed_visible,
                interactive=True,
            )
            # Free-text instructions are only honored by gpt-4o-mini-tts.
            global_instructions_input = gr.Textbox(
                label="Global Instructions (gpt-4o-mini-tts)",
                placeholder="e.g., Speak calmly.",
                visible=_instr_visible,
                interactive=True,
                lines=2,
            )

    gr.Markdown("### Speaker Configuration")
    with gr.Row():
        speaker_config_method_dropdown = gr.Dropdown(
            SPEAKER_CONFIG_METHODS,
            label="Speaker Config Method",
            value=DEFAULT_SPEAKER_CONFIG_METHOD,
        )
        global_voice_dropdown = gr.Dropdown(
            APP_AVAILABLE_VOICES,
            label="Global Voice ('Single Voice' method)",
            value=APP_AVAILABLE_VOICES[0],
            visible=_single_voice_visible,
            interactive=True,
        )

    # Seed JSON shown in the detailed-config editor.
    initial_json_config_example = """[
  {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
  {"speaker": "Bob", "voice": "echo"},
  {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}
]""".strip()

    detailed_speaker_config_input = gr.Code(
        label="Detailed Speaker Configuration (JSON)",
        language="json",
        lines=7,
        value=initial_json_config_example,
        visible=_detailed_visible,
    )
    # Helper text whose visibility tracks the JSON editor's.
    detailed_config_info_md = gr.Markdown(
        "<small>Define voice per speaker. Optionally, `speed` (0.25-4.0) for `tts-1`/`tts-1-hd` models, "
        "and `instructions` (text) for `gpt-4o-mini-tts`.</small>",
        visible=_detailed_visible,
    )

    with gr.Row():
        calculate_cost_button = gr.Button("Calculate Cost")
        generate_button = gr.Button("Generate Audio", variant="primary")

    cost_output = gr.Textbox(label="Estimated Cost", interactive=False)

    with gr.Row():
        individual_lines_zip_output = gr.File(label="Download ZIP")
        merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")

    status_output = gr.Textbox(label="Status", interactive=False, lines=2)
290
 
291
+ def update_model_controls(selected_model):
292
+ is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
293
+ is_gpt_mini = selected_model == "gpt-4o-mini-tts"
 
294
  return {
295
+ global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
296
  global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
297
  }
298
+ tts_model_dropdown.change(fn=update_model_controls, inputs=[tts_model_dropdown], outputs=[global_speed_input, global_instructions_input])
299
 
300
+ def update_speaker_controls(method):
301
+ is_single = (method == "Single Voice (Global)")
302
+ is_detailed = (method == "Detailed Configuration (JSON)")
 
 
 
 
 
 
303
  return {
304
+ global_voice_dropdown: gr.update(visible=is_single, interactive=is_single),
305
+ detailed_speaker_config_input: gr.update(visible=is_detailed, interactive=is_detailed),
306
+ detailed_config_info_md: gr.update(visible=is_detailed) # Control visibility of new Markdown
307
  }
308
+ speaker_config_method_dropdown.change(fn=update_speaker_controls, inputs=[speaker_config_method_dropdown], outputs=[global_voice_dropdown, detailed_speaker_config_input, detailed_config_info_md])
 
 
 
 
 
309
 
310
+ calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
 
 
 
 
 
 
311
  generate_button.click(
312
  fn=handle_script_processing,
313
+ inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown,
314
+ global_voice_dropdown, detailed_speaker_config_input, global_speed_input, global_instructions_input],
315
+ outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
 
 
 
 
316
 
317
  gr.Markdown("## Examples")
318
+ ex_script1 = "[A] Hi B.\n[B] Hey A.\n[A] What's up?"
319
+ ex_json1 = '[{"speaker": "A", "voice": "nova", "instr...": "curious"}, {"speaker": "B", "voice": "echo", "speed": 0.9}]'.replace("instr...", "instructions") # Hack for brevity
320
+ ex_script2 = "[Cpt] Status?\n[Comp] Nominal. Slow.\n[Cpt] Good."
321
+ ex_json2 = '[{"speaker": "Cpt", "voice": "alloy"}, {"speaker": "Comp", "voice": "onyx", "speed": 0.8, "instr...": "robotic"}]'.replace("instr...", "instructions")
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  gr.Examples(
323
  examples=[
324
+ [ex_script1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json1, 1.0, "Speak naturally."],
325
+ [ex_script2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], ex_json2, 1.1, ""],
326
+ ["[N] Single line.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]],
327
+ inputs=[script_input, tts_model_dropdown, pause_input, speaker_config_method_dropdown, global_voice_dropdown,
328
+ detailed_speaker_config_input, global_speed_input, global_instructions_input],
 
 
 
 
329
  outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
330
+ fn=handle_script_processing, cache_examples=False)
 
 
331
 
332
  if __name__ == "__main__":
333
+ if os.name == 'nt': asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
 
334
  demo.launch()