abocha committed
Commit d48101f · 1 parent: d44dfc0
Files changed (6):
  1. .gitignore +1 -1
  2. README.md +65 -6
  3. app.py +141 -131
  4. utils/merge_audio.py +112 -69
  5. utils/openai_tts.py +139 -83
  6. utils/script_parser.py +58 -26
.gitignore CHANGED
@@ -1,4 +1,4 @@
-pycache/
+__pycache__/
 *.pyc
 *.pyo
 *.pyd
README.md CHANGED
@@ -1,12 +1,71 @@
 ---
-title: Esl Dialogue Tts
-emoji: 📈
-colorFrom: indigo
-colorTo: yellow
+title: Dialogue TTS
+emoji: 🗣️🎙️
+colorFrom: blue
+colorTo: green
 sdk: gradio
-sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Dialogue Script to Speech Synthesis
+
+This Hugging Face Space converts dialogue scripts into speech using OpenAI's TTS models (`tts-1`, `tts-1-hd`, `gpt-4o-mini-tts`).
+
+## Features
+
+* **Input Script**: Provide a dialogue script with lines in the format `[Speaker] Utterance`.
+* **TTS Models**: Choose from `tts-1`, `tts-1-hd`, or `gpt-4o-mini-tts`.
+* **Voice Configuration**:
+  * **Single Global Voice**: Use one voice for all speakers.
+  * **Random per Speaker**: Assigns a random voice to each speaker, kept consistent within a run.
+  * **A/B Round Robin**: Cycles through the available voices across the unique speakers.
+* **Detailed Per-Speaker UI**: Configure voice, speed (for `tts-1`/`tts-1-hd`), and emotional vibe or custom instructions (for `gpt-4o-mini-tts`) for each speaker individually.
+* **Output**:
+  * A ZIP file containing individual MP3s for each line.
+  * A single merged MP3 of the entire dialogue with custom pauses.
+* **Cost Estimation**: Displays an estimated cost before generating audio.
+* **NSFW Check**: Optional content safety check using an external API (if `NSFW_API_URL_TEMPLATE` is configured).
+
+## How to Use
+
+1. **Enter your dialogue script** in the text area. Example:
+   ```
+   [Alice] Hello Bob, how are you today?
+   [Bob] I'm doing great, Alice! Thanks for asking.
+   [Narrator] And so their conversation began.
+   ```
+2. **Select the TTS Model**.
+3. **Set the pause duration** (in milliseconds) between lines for the merged audio.
+4. **Choose a Speaker Configuration Method**:
+   * For "Single Voice (Global)", select the voice.
+   * For "Detailed Configuration...", click "Load/Refresh Per-Speaker Settings UI" and adjust the settings for each speaker.
+   * The other methods assign voices automatically.
+5. (Optional) Adjust **Global Speed** or **Global Instructions** if applicable to your chosen model and configuration.
+6. Click **"Calculate Cost"** to see an estimate.
+7. Click **"Generate Audio"**.
+8. Download the ZIP file, or listen to/download the merged MP3.
+
+## Secrets
+
+This Space requires the following secrets to be set in the Hugging Face Space settings:
+
+* `OPENAI_API_KEY`: Your OpenAI API key.
+* `NSFW_API_URL_TEMPLATE` (Optional): URL template for NSFW checking, e.g., `https://api.example.com/check?text={text}`. The `{text}` placeholder will be URL-encoded.
+* `MODEL_DEFAULT` (Optional): Default TTS model (e.g., `tts-1-hd`).
+
+## Smoke Test Script
+
+Use the following script to test basic functionality:
+
+```
+[Gandalf] You shall not pass!
+[Frodo] I will take the Ring to Mordor.
+[Gandalf] So be it.
+```
+
+Choose your desired model and settings (e.g., "Random per Speaker"), then generate.
+
+## Deployment
+
+This application is designed to be deployed as a Hugging Face Space.
+Ensure `ffmpeg` is available (handled by `container.yaml` for Classic Spaces).
+Set the necessary secrets in your Space settings on Hugging Face Hub.
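
Editor's note: the `[Speaker] Utterance` format above is parsed by `utils/script_parser.py` (+58 -26 in this commit; its hunks are not reproduced on this page). As a rough illustration only — the regex, function name, and return shape below are assumptions for this sketch, not the Space's actual parser:

```python
import re

# Hypothetical sketch: one dialogue line per text line, "[Speaker] Utterance".
LINE_RE = re.compile(r"^\[(?P<speaker>[^\]]+)\]\s*(?P<text>.+)$")

def parse_dialogue_script_sketch(script_text: str):
    """Return (parsed_lines, total_chars) for lines matching [Speaker] Utterance."""
    parsed, total_chars = [], 0
    for i, raw in enumerate(script_text.splitlines()):
        raw = raw.strip()
        if not raw:
            continue  # skip blank lines
        m = LINE_RE.match(raw)
        if not m:
            # app.py catches ValueError from the real parser, so the sketch raises one too
            raise ValueError(f"Line {i + 1} is not in '[Speaker] Utterance' format: {raw!r}")
        parsed.append({"id": len(parsed), "speaker": m.group("speaker").strip(), "text": m.group("text").strip()})
        total_chars += len(m.group("text"))
    return parsed, total_chars

# Example: parse_dialogue_script_sketch("[Alice] Hi!\n[Bob] Hello!")
# -> ([{'id': 0, 'speaker': 'Alice', 'text': 'Hi!'},
#      {'id': 1, 'speaker': 'Bob', 'text': 'Hello!'}], 9)
```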
app.py CHANGED
@@ -46,13 +46,13 @@ SPEAKER_CONFIG_METHODS = [
46
  "Single Voice (Global)",
47
  "Random per Speaker",
48
  "A/B Round Robin",
49
- "Detailed Configuration (Per Speaker UI)" # New Method Name
50
  ]
51
  DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
52
- APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
53
 
54
  PREDEFINED_VIBES = {
55
- "None": "", # No specific instruction
56
  "Calm": "Speak in a calm, composed, and relaxed manner.",
57
  "Excited": "Speak with an energetic, enthusiastic, and lively tone.",
58
  "Happy": "Speak with a cheerful, bright, and joyful voice.",
@@ -63,7 +63,7 @@ PREDEFINED_VIBES = {
63
  "Formal": "Speak in a clear, precise, and professional tone, suitable for a formal address.",
64
  "Authoritative": "Speak with a commanding, confident, and firm voice.",
65
  "Friendly": "Speak in a warm, approachable, and amiable manner.",
66
- "Custom...": "CUSTOM" # Special value indicating custom text should be used
67
  }
68
  VIBE_CHOICES = list(PREDEFINED_VIBES.keys())
69
  DEFAULT_VIBE = "None"
@@ -72,45 +72,32 @@ def get_speakers_from_script(script_text):
72
  if not script_text.strip(): return []
73
  try:
74
  parsed_lines, _ = parse_dialogue_script(script_text)
75
- return sorted(list(set(p["speaker"] for p in parsed_lines)))
 
 
 
 
 
 
 
76
  except ValueError: return []
77
 
78
 
79
  def handle_dynamic_input_change(new_value, current_configs_state_dict, speaker_name, config_key, tts_model):
80
- """
81
- Updates the gr.State dictionary when a dynamic UI element changes.
82
- current_configs_state_dict is the raw dictionary from gr.State.
83
- """
84
  if speaker_name not in current_configs_state_dict:
85
  current_configs_state_dict[speaker_name] = {}
86
 
87
  current_configs_state_dict[speaker_name][config_key] = new_value
88
-
89
- # Special handling for Vibe -> Custom Instructions visibility (Simpler: custom textbox always visible)
90
- # For this iteration, custom textbox is always visible. Backend decides to use it.
91
-
92
- # Determine visibility/interactivity of speed slider for this specific speaker's UI (if we were to update it directly)
93
- # This is complex to do from a generic handler. Better to set initial visibility in load_refresh_per_speaker_ui.
94
- # Global tts_model_dropdown change will refresh the whole dynamic UI if needed for speed/instr applicability.
95
-
96
  return current_configs_state_dict
97
 
98
 
99
  def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_model):
100
- """
101
- Generates the dynamic UI components (accordions) for each speaker.
102
- Returns a list of Gradio components and the updated state.
103
- """
104
  unique_speakers = get_speakers_from_script(script_text)
105
  new_ui_components = []
106
 
107
- # Ensure state dict is not None (Gradio might pass None initially for gr.State)
108
  if current_configs_state_dict is None:
109
  current_configs_state_dict = {}
110
 
111
- # Update state for any new speakers or remove speakers no longer in script
112
- # (Optional: more complex logic could be to remove speakers from state if not in script)
113
- # For now, just add new ones with defaults if not present.
114
  for speaker_name in unique_speakers:
115
  if speaker_name not in current_configs_state_dict:
116
  current_configs_state_dict[speaker_name] = {
@@ -119,7 +106,6 @@ def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_mod
119
  "vibe": DEFAULT_VIBE,
120
  "custom_instructions": ""
121
  }
122
- # Ensure all keys exist for existing speakers (e.g., if new fields added)
123
  current_configs_state_dict[speaker_name].setdefault("voice", APP_AVAILABLE_VOICES[0])
124
  current_configs_state_dict[speaker_name].setdefault("speed", 1.0)
125
  current_configs_state_dict[speaker_name].setdefault("vibe", DEFAULT_VIBE)
@@ -128,42 +114,37 @@ def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_mod
128
 
129
  if not unique_speakers:
130
  new_ui_components.append(gr.Markdown("No speakers detected in the script, or script is empty. Type a script and click 'Load/Refresh' again."))
131
- # Return current (possibly empty) state and the markdown message
132
  return new_ui_components, current_configs_state_dict
133
 
134
 
135
  for speaker_name in unique_speakers:
136
- speaker_cfg = current_configs_state_dict[speaker_name] # Should exist now
137
 
138
- # Determine if speed/instructions are applicable for the current global TTS model
139
  speed_interactive = tts_model in ["tts-1", "tts-1-hd"]
140
- instructions_relevant = tts_model == "gpt-4o-mini-tts" # Vibe/Custom is primarily for this
141
 
142
  with gr.Accordion(label=f"Settings for: {speaker_name}", open=False) as speaker_accordion:
143
- # Voice Dropdown
144
  voice_dd = gr.Dropdown(
145
  label="Voice", choices=APP_AVAILABLE_VOICES, value=speaker_cfg["voice"], interactive=True
146
  )
147
  voice_dd.change(
148
  fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="voice", tts_model=tts_model),
149
- inputs=[voice_dd, speaker_configs_state], # Pass the component itself and the state
150
  outputs=[speaker_configs_state]
151
  )
152
 
153
- # Speed Slider
154
  speed_slider_label = "Speech Speed" + (" (Active for tts-1/hd)" if speed_interactive else " (N/A for this model)")
155
  speed_slider = gr.Slider(
156
  label=speed_slider_label, minimum=0.25, maximum=4.0, value=speaker_cfg["speed"],
157
  step=0.05, interactive=speed_interactive
158
  )
159
- if speed_interactive: # Only attach listener if interactive
160
- speed_slider.release( # Use release to avoid too many updates during drag
161
  fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="speed", tts_model=tts_model),
162
  inputs=[speed_slider, speaker_configs_state],
163
  outputs=[speaker_configs_state]
164
  )
165
 
166
- # Vibe Dropdown
167
  vibe_label = "Vibe/Emotion Preset" + (" (For gpt-4o-mini-tts)" if instructions_relevant else " (Less impact on other models)")
168
  vibe_dd = gr.Dropdown(
169
  label=vibe_label, choices=VIBE_CHOICES, value=speaker_cfg["vibe"], interactive=True
@@ -174,16 +155,15 @@ def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_mod
174
  outputs=[speaker_configs_state]
175
  )
176
 
177
- # Custom Instructions Textbox
178
  custom_instr_label = "Custom Instructions"
179
  custom_instr_placeholder = "Only used if Vibe is 'Custom...'. Overrides Vibe."
180
  custom_instr_tb = gr.Textbox(
181
  label=custom_instr_label,
182
  value=speaker_cfg["custom_instructions"],
183
  placeholder=custom_instr_placeholder,
184
- lines=2, interactive=True # Always interactive, backend logic decides if used
185
  )
186
- custom_instr_tb.input( # Use input for real-time typing updates
187
  fn=partial(handle_dynamic_input_change, speaker_name=speaker_name, config_key="custom_instructions", tts_model=tts_model),
188
  inputs=[custom_instr_tb, speaker_configs_state],
189
  outputs=[speaker_configs_state]
@@ -196,7 +176,6 @@ def load_refresh_per_speaker_ui(script_text, current_configs_state_dict, tts_mod
196
  async def handle_script_processing(
197
  dialogue_script: str, tts_model: str, pause_ms: int,
198
  speaker_config_method: str, global_voice_selection: str,
199
- # No more df_value, instead we use speaker_configs_state_dict from gr.State
200
  speaker_configs_state_dict: dict,
201
  global_speed: float,
202
  global_instructions: str, progress=gr.Progress(track_tqdm=True)):
@@ -204,65 +183,65 @@ async def handle_script_processing(
     if not OPENAI_API_KEY or not async_openai_client: return None, None, "Error: OPENAI_API_KEY missing."
     if not dialogue_script.strip(): return None, None, "Error: Script empty."
 
-    job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
+    # Create a job-specific temporary directory and ensure it's clean
+    job_audio_path_prefix = os.path.join(tempfile.gettempdir(), f"dialogue_tts_job_{random.randint(10000, 99999)}")
     if os.path.exists(job_audio_path_prefix): shutil.rmtree(job_audio_path_prefix)
     os.makedirs(job_audio_path_prefix, exist_ok=True)
 
     try:
         parsed_lines, _ = parse_dialogue_script(dialogue_script)
-        if not parsed_lines: return None, None, "Error: No valid lines."
-    except ValueError as e: return None, None, f"Script error: {str(e)}"
+        if not parsed_lines:
+            shutil.rmtree(job_audio_path_prefix)
+            return None, None, "Error: No valid lines found in script."
+    except ValueError as e:
+        shutil.rmtree(job_audio_path_prefix)
+        return None, None, f"Script parsing error: {str(e)}"
 
-    # Ensure state dict is usable
     if speaker_configs_state_dict is None: speaker_configs_state_dict = {}
 
+    # --- Voice assignment map for Random and A/B per Speaker ---
+    speaker_voice_map = {}
+    if speaker_config_method in ["Random per Speaker", "A/B Round Robin"]:
+        unique_script_speakers_for_map = get_speakers_from_script(dialogue_script)
+        if speaker_config_method == "Random per Speaker":
+            for spk_name in unique_script_speakers_for_map:
+                speaker_voice_map[spk_name] = random.choice(APP_AVAILABLE_VOICES)
+        elif speaker_config_method == "A/B Round Robin":
+            for i, spk_name in enumerate(unique_script_speakers_for_map):
+                # Ensure APP_AVAILABLE_VOICES is not empty to prevent modulo by zero
+                if APP_AVAILABLE_VOICES:
+                    speaker_voice_map[spk_name] = APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]
+                else:  # Fallback if voice list is somehow empty
+                    speaker_voice_map[spk_name] = "alloy"  # Default OpenAI voice
+    # --- End voice assignment map ---
+
     tasks, line_audio_files = [], [None] * len(parsed_lines)
     for i, line_data in enumerate(parsed_lines):
         speaker_name = line_data["speaker"]
 
-        # Determine voice, speed, instructions for this line
-        line_voice = global_voice_selection
+        line_voice = global_voice_selection  # Default for "Single Voice (Global)" or fallback
         line_speed = global_speed
         line_instructions = global_instructions if global_instructions and global_instructions.strip() else None
 
         if speaker_config_method == "Detailed Configuration (Per Speaker UI)":
             spk_cfg = speaker_configs_state_dict.get(speaker_name, {})
-            line_voice = spk_cfg.get("voice", global_voice_selection) # Fallback to global if needed
-
-            # Speed: per-speaker if tts-1/hd and set, else global if tts-1/hd, else API default
+            line_voice = spk_cfg.get("voice", global_voice_selection)
             if tts_model in ["tts-1", "tts-1-hd"]:
                 line_speed = spk_cfg.get("speed", global_speed)
-
-            # Instructions: primarily for gpt-4o-mini-tts
             if tts_model == "gpt-4o-mini-tts":
                 vibe = spk_cfg.get("vibe", DEFAULT_VIBE)
                 custom_instr = spk_cfg.get("custom_instructions", "").strip()
-                if vibe == "Custom..." and custom_instr:
-                    line_instructions = custom_instr
-                elif vibe != "None" and vibe != "Custom...":
-                    line_instructions = PREDEFINED_VIBES.get(vibe, "")
-                # If vibe is None or Custom with no text, line_instructions might remain global or become ""
-                if not line_instructions and global_instructions and global_instructions.strip(): # Fallback to global if specific instructions are empty
-                    line_instructions = global_instructions
-                elif not line_instructions : # Ensure it's None if truly no instruction
-                    line_instructions = None
-
-
-        elif speaker_config_method == "Random per Speaker":
-            # Simplified: assign random now, could be cached as before for consistency within run
-            line_voice = random.choice(APP_AVAILABLE_VOICES)
-        elif speaker_config_method == "A/B Round Robin":
-            # Simplified: assign A/B now
-            unique_script_speakers = get_speakers_from_script(dialogue_script) # Re-get for this logic
-            speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
-            line_voice = APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]
+                if vibe == "Custom..." and custom_instr: line_instructions = custom_instr
+                elif vibe != "None" and vibe != "Custom...": line_instructions = PREDEFINED_VIBES.get(vibe, "")
+                if not line_instructions and global_instructions and global_instructions.strip(): line_instructions = global_instructions
+                elif not line_instructions: line_instructions = None
+        elif speaker_config_method == "Random per Speaker" or speaker_config_method == "A/B Round Robin":
+            line_voice = speaker_voice_map.get(speaker_name, global_voice_selection)  # Use mapped voice
 
-        # Fallback for speed if not tts-1/hd (API won't use it anyway)
-        if tts_model not in ["tts-1", "tts-1-hd"]:
-            line_speed = 1.0 # API default, won't be sent
+        if tts_model not in ["tts-1", "tts-1-hd"]: line_speed = 1.0
 
         out_fn = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
-        progress(i / len(parsed_lines), desc=f"Line {i+1}/{len(parsed_lines)} ({speaker_name})")
+        progress(i / len(parsed_lines), desc=f"Synthesizing: Line {i+1}/{len(parsed_lines)} ({speaker_name})")
         tasks.append(synthesize_speech_line(
             client=async_openai_client, text=line_data["text"], voice=line_voice,
             output_path=out_fn, model=tts_model, speed=line_speed,
@@ -271,102 +250,125 @@ async def handle_script_processing(
 
     results = await asyncio.gather(*tasks, return_exceptions=True)
     for idx, res in enumerate(results):
-        if isinstance(res, Exception): print(f"Error line {parsed_lines[idx]['id']}: {res}")
-        elif res is None: print(f"Skipped/failed line {parsed_lines[idx]['id']}")
-        else: line_audio_files[idx] = res
+        if isinstance(res, Exception): print(f"Error synthesizing line {parsed_lines[idx]['id']}: {res}")
+        elif res is None: print(f"Skipped or failed synthesizing line {parsed_lines[idx]['id']}")
+        else: line_audio_files[parsed_lines[idx]['id']] = res  # Store by original line ID if non-sequential
+
+    # Filter for valid, existing files, using the original parsed_lines order for merge
+    files_for_merge = []
+    for p_line in parsed_lines:
+        file_path = line_audio_files[p_line['id']]
+        if file_path and os.path.exists(file_path) and os.path.getsize(file_path) > 0:
+            files_for_merge.append(file_path)
+        else:
+            files_for_merge.append(None)  # Keep placeholder for correct ordering if a line failed
 
-    valid_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
-    if not valid_files:
-        shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio synthesized."
+    valid_files_for_zip = [f for f in files_for_merge if f]
+
+    if not valid_files_for_zip:
+        shutil.rmtree(job_audio_path_prefix); return None, None, "Error: No audio was successfully synthesized."
 
     zip_fn = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
-    with zipfile.ZipFile(zip_fn, 'w') as zf: [zf.write(p, os.path.basename(p)) for p in valid_files]
+    with zipfile.ZipFile(zip_fn, 'w') as zf:
+        for f_path in valid_files_for_zip:
+            zf.write(f_path, os.path.basename(f_path))
 
     merged_fn = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
-    merged_path = merge_mp3_files([f for f in line_audio_files if f], merged_fn, pause_ms)
+    # Pass only existing files to merge_mp3_files, maintaining order
+    ordered_files_to_merge = [f for f in files_for_merge if f]
+    merged_path = merge_mp3_files(ordered_files_to_merge, merged_fn, pause_ms)
 
-    status = f"{len(valid_files)}/{len(parsed_lines)} lines. "
-    if len(valid_files) < len(parsed_lines): status += "Some failed. "
-    if not merged_path and len(valid_files) > 0: status += "Merge failed. "
-    elif not merged_path: status += "No audio."
-    else: status += "Generated."
+    status = f"Successfully processed {len(valid_files_for_zip)} out of {len(parsed_lines)} lines. "
+    if len(valid_files_for_zip) < len(parsed_lines): status += "Some lines may have failed. "
+    if not merged_path and len(valid_files_for_zip) > 0: status += "Merging audio failed. "
+    elif not merged_path: status = "No audio to merge."  # Overrides previous status if all failed before merge
+    else: status += "Merged audio generated."
+
+    # Note: job_audio_path_prefix (temp dir) is not explicitly deleted here.
+    # Gradio File/Audio components copy the file, so the temp dir can be cleaned
+    # by the OS or a cleanup routine if this Space were long-running.
+    # For HF Spaces, /tmp is ephemeral anyway. For robustness, could add shutil.rmtree(job_audio_path_prefix)
+    # after files are served, but need to ensure Gradio has finished with them.
+    # For now, rely on new unique dir per run and ephemeral /tmp.
 
     return (zip_fn if os.path.exists(zip_fn) else None,
             merged_path if merged_path and os.path.exists(merged_path) else None,
             status)
 
 
 def handle_calculate_cost(dialogue_script: str, tts_model: str):
-    # ... (same as before) ...
-    if not dialogue_script.strip(): return "Cost: $0.00 (Empty)"
+    if not dialogue_script.strip(): return "Cost: $0.00 (Script is empty)"
     try:
         parsed, chars = parse_dialogue_script(dialogue_script)
-        if not parsed: return "Cost: $0.00 (No lines)"
+        if not parsed: return "Cost: $0.00 (No valid lines in script)"
         cost = calculate_cost(chars, len(parsed), tts_model)
-        return f"Est. Cost: ${cost:.6f}"
-    except Exception as e: return f"Cost calc error: {str(e)}"
+        # Using .6f for precision, especially for char-based cost
+        return f"Estimated Cost for {len(parsed)} lines ({chars} chars): ${cost:.6f}"
+    except ValueError as e:  # Catch script length error from parser
+        return f"Cost calculation error: {str(e)}"
+    except Exception as e:
+        return f"An unexpected error occurred during cost calculation: {str(e)}"
 
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# Dialogue Script to Speech (Dynamic Per-Speaker UI)")
+    gr.Markdown("# Dialogue Script to Speech (OpenAI TTS)")
     if not OPENAI_API_KEY or not async_openai_client:
-        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY not set.</h3>")
+        gr.Markdown("<h3 style='color:red;'>⚠️ Warning: OPENAI_API_KEY secret is not set or invalid. Audio generation will fail. Please configure it in your Space settings.</h3>")
 
-    # State to hold detailed speaker configurations
     speaker_configs_state = gr.State({})
 
     with gr.Row():
         with gr.Column(scale=2):
-            script_input = gr.TextArea(label="Dialogue Script", placeholder="[S1] Hi!\n[S2] Hello!", lines=10)
+            script_input = gr.TextArea(label="Dialogue Script", placeholder="[Speaker1] Hello world!\n[Speaker2] How are you today?", lines=10)
         with gr.Column(scale=1):
             tts_model_dropdown = gr.Dropdown(TTS_MODELS_AVAILABLE, label="TTS Model", value=MODEL_DEFAULT)
-            pause_input = gr.Number(label="Pause (ms)", value=500, minimum=0, maximum=5000, step=50)
-            global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
-            global_instructions_input = gr.Textbox(label="Global Instructions", placeholder="e.g., Speak calmly.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
+            pause_input = gr.Number(label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50)
+            global_speed_input = gr.Slider(minimum=0.25, maximum=4.0, value=1.0, step=0.05, label="Global Speed (for tts-1/hd)", visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]), interactive=True)
+            global_instructions_input = gr.Textbox(label="Global Instructions (for gpt-4o-mini-tts)", placeholder="e.g., Speak with a calm tone.", visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"), interactive=True, lines=2)
 
-    gr.Markdown("### Speaker Configuration Method")
+    gr.Markdown("### Speaker Voice & Style Configuration")
     speaker_config_method_dropdown = gr.Dropdown(
-        SPEAKER_CONFIG_METHODS, label="Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
+        SPEAKER_CONFIG_METHODS, label="Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
     )
 
-    # UI for "Single Voice (Global)"
     with gr.Group(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)")) as single_voice_group:
         global_voice_dropdown = gr.Dropdown(
-            APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0], interactive=True
+            APP_AVAILABLE_VOICES, label="Global Voice", value=APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy", interactive=True
        )
 
-    # UI for "Detailed Configuration (Per Speaker UI)"
     with gr.Column(visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (Per Speaker UI)")) as detailed_per_speaker_ui_group:
         load_per_speaker_ui_button = gr.Button("Load/Refresh Per-Speaker Settings UI (from Script Above)")
-        gr.Markdown("<small>Click button above to populate settings for each speaker found in the script. Settings are per-speaker.</small>")
-        # This column will be populated by the output of load_per_speaker_ui_button
+        gr.Markdown("<small>Click button above to populate settings for each speaker found in the script. Settings are applied per-speaker. If script changes, click again to refresh.</small>")
         dynamic_speaker_ui_area = gr.Column(elem_id="dynamic_ui_area_for_speakers")
 
 
     with gr.Row():
-        calculate_cost_button = gr.Button("Calculate Cost")
+        calculate_cost_button = gr.Button("Calculate Estimated Cost")
         generate_button = gr.Button("Generate Audio", variant="primary")
 
     cost_output = gr.Textbox(label="Estimated Cost", interactive=False)
     with gr.Row():
-        individual_lines_zip_output = gr.File(label="Download ZIP")
-        merged_dialogue_mp3_output = gr.Audio(label="Merged MP3", type="filepath")
-        status_output = gr.Textbox(label="Status", interactive=False, lines=1)
+        individual_lines_zip_output = gr.File(label="Download Individual Lines (ZIP)")
+        merged_dialogue_mp3_output = gr.Audio(label="Play/Download Merged Dialogue (MP3)", type="filepath")
+        status_output = gr.Textbox(label="Status", interactive=False, lines=2, max_lines=5)
 
-    # --- Event Handlers ---
     def update_model_controls_visibility(selected_model, script_text_for_refresh, current_speaker_configs_for_refresh):
-        # When model changes, also refresh the dynamic UI because speed/instr applicability changes
-        # This means load_refresh_per_speaker_ui will be called.
-        new_dynamic_ui, updated_state = load_refresh_per_speaker_ui(script_text_for_refresh, current_speaker_configs_for_refresh, selected_model)
+        new_dynamic_ui_components, updated_state = load_refresh_per_speaker_ui(script_text_for_refresh, current_speaker_configs_for_refresh, selected_model)
 
-        is_tts1 = selected_model in ["tts-1", "tts-1-hd"]
-        is_gpt_mini = selected_model == "gpt-4o-mini-tts"
+        is_tts1_family = selected_model in ["tts-1", "tts-1-hd"]
+        is_gpt_mini_tts = selected_model == "gpt-4o-mini-tts"
 
+        # It's crucial that dynamic_speaker_ui_area receives the *list* of components.
+        # If it's wrapped in a gr.update, it might not render correctly unless gr.update(children=...)
+        # Direct assignment seems to be what Gradio expects when outputting to a Column/Row that acts as a container.
         return {
-            global_speed_input: gr.update(visible=is_tts1, interactive=is_tts1),
-            global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini),
-            dynamic_speaker_ui_area: new_dynamic_ui, # Return the actual list of components
+            global_speed_input: gr.update(visible=is_tts1_family, interactive=is_tts1_family),
+            global_instructions_input: gr.update(visible=is_gpt_mini_tts, interactive=is_gpt_mini_tts),
+            dynamic_speaker_ui_area: new_dynamic_ui_components,
             speaker_configs_state: updated_state
         }
+
     tts_model_dropdown.change(
         fn=update_model_controls_visibility,
         inputs=[tts_model_dropdown, script_input, speaker_configs_state],
@@ -376,7 +378,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     def update_speaker_config_method_visibility(method):
         is_single = (method == "Single Voice (Global)")
         is_detailed_per_speaker = (method == "Detailed Configuration (Per Speaker UI)")
-        # Add more if other methods exist...
         return {
             single_voice_group: gr.update(visible=is_single),
             detailed_per_speaker_ui_group: gr.update(visible=is_detailed_per_speaker),
@@ -390,40 +391,49 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     load_per_speaker_ui_button.click(
         fn=load_refresh_per_speaker_ui,
         inputs=[script_input, speaker_configs_state, tts_model_dropdown],
-        # Output the list of components to the column, and the updated state to the state component
         outputs=[dynamic_speaker_ui_area, speaker_configs_state]
     )
 
     calculate_cost_button.click(fn=handle_calculate_cost, inputs=[script_input, tts_model_dropdown], outputs=[cost_output])
 
-    # Generate button now takes speaker_configs_state as input
     generate_button.click(
         fn=handle_script_processing,
         inputs=[
             script_input, tts_model_dropdown, pause_input,
             speaker_config_method_dropdown, global_voice_dropdown,
-            speaker_configs_state, # Pass the state object
+            speaker_configs_state,
             global_speed_input, global_instructions_input
         ],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output])
 
-    gr.Markdown("## Examples")
+    gr.Markdown("## Example Scripts")
+    example_script_1 = "[Alice] Hello Bob, this is a test using the detailed configuration method.\n[Bob] Hi Alice! I'm Bob, and I'll have my own voice settings.\n[Alice] Let's see how this sounds."
+    example_script_2 = "[Narrator] This is a short story.\n[CharacterA] Once upon a time...\n[Narrator] ...there was a Gradio app.\n[CharacterB] And it could talk!"
+
     gr.Examples(
         examples=[
-            ["[Alice] Hello from Alice!\n[Bob] Bob here, testing the dynamic UI.", "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", APP_AVAILABLE_VOICES[0], {}, 1.0, ""],
-            ["[Narrator] Just one line, using global.", "tts-1", 0, "Single Voice (Global)", "fable", {}, 1.2, ""],
+            [example_script_1, "tts-1-hd", 300, "Detailed Configuration (Per Speaker UI)", APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy", {}, 1.0, ""],
+            [example_script_2, "gpt-4o-mini-tts", 200, "Random per Speaker", APP_AVAILABLE_VOICES[0] if APP_AVAILABLE_VOICES else "alloy", {}, 1.0, "Speak with a gentle, storytelling voice for the narrator."],
+            ["[Solo] Just one line, using global voice and speed.", "tts-1", 0, "Single Voice (Global)", "fable", {}, 1.2, ""],
         ],
-        # Note: speaker_configs_state is passed as an empty dict {} for examples.
-        # The user would click "Load/Refresh Per-Speaker UI" after an example loads.
+        # speaker_configs_state is passed as an empty dict {} for examples.
+        # For "Detailed Configuration", the user should click "Load/Refresh Per-Speaker UI" after an example loads to populate the UI.
         inputs=[
             script_input, tts_model_dropdown, pause_input,
             speaker_config_method_dropdown, global_voice_dropdown,
             speaker_configs_state,
            global_speed_input, global_instructions_input
         ],
+        # Outputs for examples are not strictly necessary to pre-compute if cache_examples=False
+        # but defining them can help Gradio understand the flow.
+        # We can make the example click run the full processing.
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
-        fn=handle_script_processing, cache_examples=False)
+        fn=handle_script_processing,
+        cache_examples=False  # Set to True if pre-computation is desired and feasible
+    )
 
 if __name__ == "__main__":
-    if os.name == 'nt': asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-    demo.launch(debug=True)
+    # Required for Windows if using asyncio with ProactorEventLoop which can be default
+    if os.name == 'nt':
+        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+    demo.launch(debug=True)  # Debug=True for development, remove for production/HF Space
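
Editor's note on the app.py changes above: `handle_script_processing` now builds `speaker_voice_map` once per run, so "Random per Speaker" and "A/B Round Robin" give each speaker a stable voice across all of their lines instead of re-deciding per line. A self-contained sketch of just that mapping step (the voice list below is assumed for illustration; the app uses `APP_AVAILABLE_VOICES`):

```python
import random

# Illustrative only: mirrors the commit's per-run mapping with an assumed voice list.
VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]

def build_voice_map(speakers, method, rng=random):
    """Assign one voice per unique speaker, kept consistent for the whole run."""
    if method == "Random per Speaker":
        return {name: rng.choice(VOICES) for name in speakers}
    if method == "A/B Round Robin":
        # Speaker i gets voice i modulo the number of voices.
        return {name: VOICES[i % len(VOICES)] for i, name in enumerate(speakers)}
    return {}

# Example: build_voice_map(["Gandalf", "Frodo"], "A/B Round Robin")
# -> {'Gandalf': 'alloy', 'Frodo': 'echo'}
```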
utils/merge_audio.py CHANGED
@@ -4,92 +4,135 @@ import os
 def merge_mp3_files(file_paths, output_filename, pause_ms=500):
     """
     Merges multiple MP3 files into a single MP3 file with a specified pause
-    between each segment.
+    between each segment. Skips missing or empty files.
+    Args:
+        file_paths (list): A list of paths to MP3 files to merge.
+                           Can contain None entries for files that failed synthesis; these will be skipped.
+        output_filename (str): The path to save the merged MP3 file.
+        pause_ms (int): Duration of silence in milliseconds to add between segments.
+    Returns:
+        str: The path to the merged MP3 file if successful, None otherwise.
     """
     if not file_paths:
+        print("Warning: No file paths provided for merging.")
         return None
 
-    combined = AudioSegment.empty()
-    pause_segment = AudioSegment.silent(duration=pause_ms) if pause_ms > 0 else AudioSegment.empty()
+    valid_segments = []
+    for file_path in file_paths:
+        if file_path and os.path.exists(file_path) and os.path.getsize(file_path) > 0:
+            try:
+                segment = AudioSegment.from_mp3(file_path)
+                valid_segments.append(segment)
+            except Exception as e:
+                print(f"Error loading audio segment from {file_path}: {e}. Skipping this file.")
+        elif file_path:  # File path provided but file is missing or empty
+            print(f"Warning: File {file_path} is missing or empty. Skipping.")
+        # If file_path is None, it's silently skipped (already handled upstream)
 
-    for i, file_path in enumerate(file_paths):
-        if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
-            print(f"Warning: File {file_path} is missing or empty. Skipping.")
-            continue
-        try:
-            segment = AudioSegment.from_mp3(file_path)
-            combined += segment
-            if i < len(file_paths) - 1: # Don't add pause after the last segment
-                combined += pause_segment
-        except Exception as e:
-            print(f"Error processing file {file_path}: {e}. Skipping.")
-            continue
-
-    if len(combined) == 0:
+    if not valid_segments:
         print("No valid audio segments found to merge.")
         return None
 
+    # Start with the first valid segment
+    combined_audio = valid_segments[0]
+
+    # Add subsequent segments with pauses
+    if len(valid_segments) > 1:
+        pause_segment = AudioSegment.silent(duration=max(0, pause_ms))  # Ensure pause_ms is not negative
+        for segment in valid_segments[1:]:
+            combined_audio += pause_segment
+            combined_audio += segment
+
     try:
-        combined.export(output_filename, format="mp3")
-        return output_filename
+        # Export the combined audio to MP3 format
+        # May require ffmpeg/libav to be installed and accessible in PATH
+        combined_audio.export(output_filename, format="mp3")
+        if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
+            return output_filename
+        else:
+            print(f"Error: Merged file {output_filename} was not created or is empty after export.")
+            return None
     except Exception as e:
         print(f"Error exporting merged MP3 to {output_filename}: {e}")
         return None
 
+# Helper function to create dummy MP3 files for testing (requires pydub and ffmpeg)
+def _create_dummy_mp3(filename, duration_ms=1000, text_for_log="dummy"):
+    try:
+        # Create a silent audio segment
+        silence = AudioSegment.silent(duration=duration_ms)
+        # Export it as an MP3 file
+        silence.export(filename, format="mp3")
+        print(f"Successfully created dummy MP3: {filename} (duration: {duration_ms}ms) for '{text_for_log}'")
+        return True
+    except Exception as e:
+        print(f"Could not create dummy MP3 '{filename}'. Ensure ffmpeg is installed and accessible. Error: {e}")
+        return False
+
 if __name__ == '__main__':
-    # Create dummy mp3 files for testing (requires ffmpeg to be installed and pydub)
-    # This test assumes you have some small MP3s or can generate them.
-    # For a self-contained test, you might need to generate silent MP3s.
+    print("--- Testing merge_mp3_files ---")
 
-    print("This script is intended to be used as a module.")
-    print("To test, ensure you have some MP3 files and call merge_mp3_files directly.")
-    # Example:
-    # create_dummy_mp3("dummy1.mp3", duration_ms=1000)
-    # create_dummy_mp3("dummy2.mp3", duration_ms=1500)
-    # merge_mp3_files(["dummy1.mp3", "dummy2.mp3"], "merged_output.mp3", pause_ms=200)
-    # os.remove("dummy1.mp3")
-    # os.remove("dummy2.mp3")
-    # os.remove("merged_output.mp3")
-
-    # Helper to create dummy files if needed for a more robust test
-    def create_dummy_mp3(filename, duration_ms=1000):
-        try:
-            silence = AudioSegment.silent(duration=duration_ms)
-            silence.export(filename, format="mp3")
-            print(f"Created dummy file: {filename}")
-        except Exception as e:
-            print(f"Could not create dummy MP3 {filename}. Ensure ffmpeg is installed and accessible. Error: {e}")
-
-
-    # Create dummy files for testing
-    dummy_files_exist = True
-    try:
-        create_dummy_mp3("test_dummy1.mp3", 1000)
-        create_dummy_mp3("test_dummy2.mp3", 1500)
-    except Exception:
-        dummy_files_exist = False
-        print("Skipping merge test as dummy files could not be created (ffmpeg issue?).")
-
-    if dummy_files_exist:
-        print("\nTesting merge_mp3_files...")
-        files_to_merge = ["test_dummy1.mp3", "test_dummy2.mp3", "non_existent_file.mp3"]
-        output_merged = "test_merged_audio.mp3"
+    test_output_dir = "test_audio_merge_output"
+    os.makedirs(test_output_dir, exist_ok=True)
+
+    dummy_files = []
+    # Create some dummy MP3 files for the test
+    if _create_dummy_mp3(os.path.join(test_output_dir, "dummy1.mp3"), 1000, "Segment 1"):
+        dummy_files.append(os.path.join(test_output_dir, "dummy1.mp3"))
+    if _create_dummy_mp3(os.path.join(test_output_dir, "dummy2.mp3"), 1500, "Segment 2"):
+        dummy_files.append(os.path.join(test_output_dir, "dummy2.mp3"))
+
+    # Test case 1: Merge existing files
+    if len(dummy_files) == 2:
+        output_merged_1 = os.path.join(test_output_dir, "merged_test1.mp3")
+        print(f"\nAttempting to merge: {dummy_files} with 300ms pause into {output_merged_1}")
+        result_path_1 = merge_mp3_files(dummy_files, output_merged_1, pause_ms=300)
+        if result_path_1 and os.path.exists(result_path_1):
+            print(f"SUCCESS: Merged audio created at: {result_path_1} (Size: {os.path.getsize(result_path_1)} bytes)")
+        else:
+            print(f"FAILURE: Merging test case 1 failed.")
+    else:
+        print("\nSkipping merge test case 1 due to failure in creating dummy files.")
 
-        result_path = merge_mp3_files(files_to_merge, output_merged, pause_ms=300)
-        if result_path and os.path.exists(result_path):
-            print(f"Successfully merged audio to: {result_path}")
-            # Simple check: merged file should be larger than individual (roughly)
-            merged_size = os.path.getsize(result_path)
-            dummy1_size = os.path.getsize("test_dummy1.mp3")
-            print(f"Size of {result_path}: {merged_size} bytes (dummy1 was {dummy1_size})")
-            if merged_size > dummy1_size : # crude check
-                print("Merge test seems OK.")
-            else:
-                print("Merged file size issue.")
-            os.remove(result_path)
+    # Test case 2: Include a non-existent file and a None entry
+    files_with_issues = [
+        dummy_files[0] if dummy_files else None,
+        os.path.join(test_output_dir, "non_existent_file.mp3"),
+        None,  # Representing a failed synthesis
+        dummy_files[1] if len(dummy_files) > 1 else None
+    ]
+    # Filter out None from the list if dummy files weren't created
+    files_with_issues_filtered = [f for f in files_with_issues if f is not None or isinstance(f, str)]
+
+    if any(f for f in files_with_issues_filtered if f and os.path.exists(f)):  # Proceed if at least one valid dummy file exists
+        output_merged_2 = os.path.join(test_output_dir, "merged_test2_with_issues.mp3")
+        print(f"\nAttempting to merge: {files_with_issues_filtered} with 500ms pause into {output_merged_2}")
+        result_path_2 = merge_mp3_files(files_with_issues_filtered, output_merged_2, pause_ms=500)
+        if result_path_2 and os.path.exists(result_path_2):
+            print(f"SUCCESS (with skips): Merged audio created at: {result_path_2} (Size: {os.path.getsize(result_path_2)} bytes)")
         else:
-            print("Failed to merge audio.")
+            print(f"NOTE: Merging test case 2 might result in fewer segments or failure if no valid files remained.")
+    else:
+        print("\nSkipping merge test case 2 as no valid dummy files were available.")
+
+    # Test case 3: Empty list of files
+    output_merged_3 = os.path.join(test_output_dir, "merged_test3_empty.mp3")
+    print(f"\nAttempting to merge an empty list of files into {output_merged_3}")
+    result_path_3 = merge_mp3_files([], output_merged_3, pause_ms=100)
+    if result_path_3 is None:
+        print("SUCCESS: Correctly handled empty file list (returned None).")
+    else:
+        print(f"FAILURE: Expected None for empty file list, got {result_path_3}")
+
+    # Test case 4: List with only None or invalid paths
+    output_merged_4 = os.path.join(test_output_dir, "merged_test4_all_invalid.mp3")
+    print(f"\nAttempting to merge list with only invalid/None files into {output_merged_4}")
+    result_path_4 = merge_mp3_files([None, "non_existent.mp3"], output_merged_4, pause_ms=100)
+    if result_path_4 is None:
+        print("SUCCESS: Correctly handled list with only invalid/None files (returned None).")
+    else:
+        print(f"FAILURE: Expected None for all-invalid list, got {result_path_4}")
 
-    # Clean up dummy files
-    if os.path.exists("test_dummy1.mp3"): os.remove("test_dummy1.mp3")
-    if os.path.exists("test_dummy2.mp3"): os.remove("test_dummy2.mp3")
+    print(f"\nTest finished. Check ./{test_output_dir}/ for any generated files.")
+    # You might want to add shutil.rmtree(test_output_dir) here for cleanup after visual inspection.
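
Editor's note: as the comments in the hunk above say, pydub's `AudioSegment.from_mp3` and `export` shell out to ffmpeg (or libav), which is why the README's Deployment section requires `ffmpeg`. A minimal preflight check, offered only as a sketch and not part of this commit:

```python
import shutil

def ffmpeg_available() -> bool:
    """Return True if an ffmpeg (or avconv) binary is on PATH, which pydub needs for MP3 I/O."""
    return shutil.which("ffmpeg") is not None or shutil.which("avconv") is not None

if not ffmpeg_available():
    print("Warning: ffmpeg not found on PATH; pydub MP3 load/export will fail.")
```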
 
utils/openai_tts.py CHANGED
@@ -3,49 +3,64 @@ import os
  import time
  from openai import AsyncOpenAI, OpenAIError, RateLimitError
  import httpx # For NSFW check

- # Expanded list of voices based on recent OpenAI documentation
- OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer', 'ash', 'ballad', 'coral', 'sage', 'verse']

- # Concurrency limiter
  MAX_CONCURRENT_REQUESTS = 2
  semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

- # Retry mechanism
  MAX_RETRIES = 3
- INITIAL_BACKOFF_SECONDS = 1

  async def is_content_safe(text: str, api_url_template: str | None) -> bool:
      """
      Checks if the content is safe using an external NSFW API.
-     Returns True if safe or if API URL is not provided, False if unsafe.
      """
      if not api_url_template:
-         return True

      if "{text}" not in api_url_template:
-         print("Warning: NSFW_API_URL_TEMPLATE does not contain {text} placeholder. Skipping NSFW check.")
-         return True

      try:
-         encoded_text = httpx.utils.quote(text)
-         url = api_url_template.format(text=encoded_text)

-         async with httpx.AsyncClient() as client:
-             response = await client.get(url, timeout=10.0)

-         if response.status_code == 200:
-             return True
-         else:
-             print(f"NSFW Check: API request failed or content flagged. Status: {response.status_code}, Response: {response.text[:200]}")
-             return False
      except httpx.RequestError as e:
-         print(f"NSFW Check: API request error: {e}")
-         return False
      except Exception as e:
          print(f"NSFW Check: An unexpected error occurred: {e}")
-         return False
-

  async def synthesize_speech_line(
      client: AsyncOpenAI,
@@ -53,116 +68,157 @@ async def synthesize_speech_line(
      voice: str,
      output_path: str,
      model: str = "tts-1-hd",
-     speed: float = 1.0,
-     instructions: str | None = None,
      nsfw_api_url_template: str | None = None,
-     line_index: int = -1
  ) -> str | None:
      """
      Synthesizes a single line of text to speech using OpenAI TTS.
-     Includes speed and instructions parameters based on model compatibility.
-     Retries on RateLimitError with exponential backoff.
      Returns the output_path if successful, None otherwise.
      """
      if nsfw_api_url_template:
          if not await is_content_safe(text, nsfw_api_url_template):
-             print(f"Line {line_index if line_index != -1 else 'N/A'}: Content flagged as NSFW. Skipping synthesis.")
-             return None

      current_retry = 0
      backoff_seconds = INITIAL_BACKOFF_SECONDS

      async with semaphore:
-         while current_retry < MAX_RETRIES:
              try:
                  request_params = {
                      "model": model,
-                     "voice": voice,
                      "input": text,
-                     "response_format": "mp3"
                  }

-                 # Add speed if model supports it and speed is not default
                  if model in ["tts-1", "tts-1-hd"]:
-                     if speed is not None and speed != 1.0: # OpenAI default is 1.0
-                         # Ensure speed is within valid range for safety, though UI should also constrain this
-                         clamped_speed = max(0.25, min(speed, 4.0))
-                         request_params["speed"] = clamped_speed

-                 # Add instructions if model supports it and instructions are provided
-                 # Assuming gpt-4o-mini-tts supports it, and tts-1/tts-1-hd do not.
-                 if model not in ["tts-1", "tts-1-hd"] and instructions: # Example: gpt-4o-mini-tts
-                     request_params["instructions"] = instructions

                  response = await client.audio.speech.create(**request_params)
                  await response.astream_to_file(output_path)
-                 return output_path

              except RateLimitError as e:
                  current_retry += 1
-                 if current_retry >= MAX_RETRIES:
-                     print(f"Line {line_index if line_index != -1 else ''}: Max retries reached for RateLimitError. Error: {e}")
                      return None
-                 print(f"Line {line_index if line_index != -1 else ''}: Rate limit hit. Retrying in {backoff_seconds}s... (Attempt {current_retry}/{MAX_RETRIES})")
                  await asyncio.sleep(backoff_seconds)
-                 backoff_seconds *= 2
-             except OpenAIError as e:
-                 print(f"Line {line_index if line_index != -1 else ''}: OpenAI API error: {e}")
                  return None
-             except Exception as e:
-                 print(f"Line {line_index if line_index != -1 else ''}: An unexpected error occurred during synthesis: {e}")
-                 return None
-         return None

  if __name__ == '__main__':
      async def main_test():
          api_key = os.getenv("OPENAI_API_KEY")
          if not api_key:
-             print("OPENAI_API_KEY not set. Skipping test.")
              return

          client = AsyncOpenAI(api_key=api_key)

-         test_lines = [
-             {"id": 0, "speaker": "Alice", "text": "Hello, this is a test line for Alice, spoken quickly."},
-             {"id": 1, "speaker": "Bob", "text": "And this is Bob, testing his voice with instructions.", "instructions": "Speak in a deep, resonant voice."},
-             {"id": 2, "speaker": "Alice", "text": "A short reply, spoken slowly.", "speed": 0.8},
-             {"id": 3, "speaker": "Charlie", "text": "Charlie here, normal speed."}
          ]

-         temp_dir = "test_audio_output_enhanced"
-         os.makedirs(temp_dir, exist_ok=True)

-         tasks = []
-         for i, line_data in enumerate(test_lines):
-             # Test with specific models to check param compatibility
-             # For Alice (speed): tts-1-hd. For Bob (instructions): gpt-4o-mini-tts
-             current_model = "tts-1-hd"
-             if "instructions" in line_data:
-                 current_model = "gpt-4o-mini-tts" # Example, ensure this model is available for your key
-
-             voice = OPENAI_VOICES[i % len(OPENAI_VOICES)]
-             output_file = os.path.join(temp_dir, f"line_{line_data['id']}_{current_model}.mp3")

-             tasks.append(
                  synthesize_speech_line(
-                     client,
-                     line_data["text"],
-                     voice,
-                     output_file,
-                     model=current_model,
-                     speed=line_data.get("speed", 1.0),
                      instructions=line_data.get("instructions"),
                      line_index=line_data['id']
                  )
              )

-         results = await asyncio.gather(*tasks)

-         successful_files = [r for r in results if r]
-         print(f"\nSuccessfully synthesized {len(successful_files)} out of {len(test_lines)} lines.")
-         for f_path in successful_files:
-             print(f" - {f_path}")

-     if os.name == 'nt':
          asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
      asyncio.run(main_test())
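For reference, a quick check of the URL-encoding behaviour the new version below switches to: `urllib.parse.quote` is the standard-library way to make text query-string safe (httpx does not expose a public `quote` helper, which is why the `httpx.utils.quote` call above was fragile). A small sketch:

```python
from urllib.parse import quote

# Spaces and '&' must be percent-encoded before substitution into a query string.
print(quote("hello world & friends"))   # hello%20world%20%26%20friends
print(quote("is this safe?", safe=""))  # is%20this%20safe%3F (safe="" also encodes '/')
```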
 
  import time
  from openai import AsyncOpenAI, OpenAIError, RateLimitError
  import httpx # For NSFW check
+ import urllib.parse # For URL-encoding text in the NSFW check

+ # Voices available for OpenAI TTS models (tts-1, tts-1-hd, gpt-4o-mini-tts).
+ # As of May 2024 these are the primary documented voices; ash, ballad, coral, sage and verse
+ # have only been mentioned for GPT-4o's native voice capabilities.
+ OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
+ # If gpt-4o-mini-tts explicitly supports more or different voices, this list may need adjustment,
+ # or the app could query available voices if an API endpoint for that exists. For now, assume these are common.

+ # Concurrency limiter for OpenAI API calls
  MAX_CONCURRENT_REQUESTS = 2
  semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

+ # Retry mechanism parameters
  MAX_RETRIES = 3
+ INITIAL_BACKOFF_SECONDS = 1.0 # Start with 1 second
+ MAX_BACKOFF_SECONDS = 16.0 # Cap backoff to avoid excessively long waits

  async def is_content_safe(text: str, api_url_template: str | None) -> bool:
      """
      Checks if the content is safe using an external NSFW API.
+     Returns True if the content is safe, if no API URL is provided, or if the check fails open.
+     Returns False if the content is flagged as unsafe by the API.
      """
      if not api_url_template:
+         return True # No NSFW check configured, assume safe

      if "{text}" not in api_url_template:
+         print(f"Warning: NSFW_API_URL_TEMPLATE ('{api_url_template}') does not contain a {{text}} placeholder. Skipping NSFW check.")
+         return True # Configuration error, fail open (assume safe)

      try:
+         encoded_text = urllib.parse.quote(text) # Ensure the text is URL-safe
+         url = api_url_template.replace("{text}", encoded_text) # Use replace for simplicity

+         # Use a timeout for the external API call
+         async with httpx.AsyncClient(timeout=10.0) as client:
+             response = await client.get(url)

+         response.raise_for_status() # Raises an exception for 4xx/5xx responses
+
+         # Assuming the API returns a specific response to indicate safety.
+         # This part needs to be adapted to the actual API's response format.
+         # For example, if it returns JSON: `data = response.json()`.
+         # If it returns 200 for safe and non-200 for unsafe, raise_for_status handles it.
+         # For this placeholder, we assume 200 means safe.
+         return True # Content is safe based on the API response
+
+     except httpx.HTTPStatusError as e:
+         # Log specific HTTP errors from the NSFW API
+         print(f"NSFW Check: API request failed. Status: {e.response.status_code}. URL: {e.request.url}. Response: {e.response.text[:200]}")
+         # Depending on policy, you might "fail closed" (treat as unsafe) or "fail open"
+         return False # Content flagged as unsafe, or an API error
      except httpx.RequestError as e:
+         print(f"NSFW Check: API request error: {e}. URL: {e.request.url if e.request else 'N/A'}")
+         return True # Fail open (assume safe) on network/request errors so TTS is not blocked
      except Exception as e:
          print(f"NSFW Check: An unexpected error occurred: {e}")
+         return True # Fail open (assume safe) on other unexpected errors

  async def synthesize_speech_line(
      client: AsyncOpenAI,

      voice: str,
      output_path: str,
      model: str = "tts-1-hd",
+     speed: float = 1.0, # Speed parameter (0.25 to 4.0). Default 1.0.
+     instructions: str | None = None, # For models like gpt-4o-mini-tts
      nsfw_api_url_template: str | None = None,
+     line_index: int = -1 # For logging purposes
  ) -> str | None:
      """
      Synthesizes a single line of text to speech using OpenAI TTS.
+     Handles rate limiting with exponential backoff and NSFW checks.
      Returns the output_path if successful, None otherwise.
      """
+     if not text.strip():
+         print(f"Line {line_index if line_index != -1 else '(unknown)'}: Input text is empty. Skipping synthesis.")
+         return None
+
      if nsfw_api_url_template:
          if not await is_content_safe(text, nsfw_api_url_template):
+             print(f"Line {line_index if line_index != -1 else '(unknown)'}: Content flagged as potentially unsafe. Skipping synthesis.")
+             return None # Skip synthesis for flagged content

      current_retry = 0
      backoff_seconds = INITIAL_BACKOFF_SECONDS

+     # Acquire the semaphore before entering the retry loop
      async with semaphore:
+         while current_retry <= MAX_RETRIES:
              try:
                  request_params = {
                      "model": model,
                      "input": text,
+                     "voice": voice,
+                     "response_format": "mp3" # Explicitly request mp3
                  }

+                 # Add speed if the model is tts-1 or tts-1-hd and speed is not the default 1.0
                  if model in ["tts-1", "tts-1-hd"]:
+                     # The OpenAI API speed range is 0.25 to 4.0.
+                     # Clamp speed to be safe, although the UI should also enforce this.
+                     clamped_speed = max(0.25, min(float(speed), 4.0))
+                     if clamped_speed != 1.0: # Only send if not the default
+                         request_params["speed"] = clamped_speed

+                 # Add instructions if provided and the model is gpt-4o-mini-tts (or another future model supporting them);
+                 # tts-1 and tts-1-hd do not support an 'instructions' parameter.
+                 if model == "gpt-4o-mini-tts" and instructions and instructions.strip():
+                     request_params["instructions"] = instructions.strip()
+
+                 # Log the request params being sent (excluding sensitive parts like the full text)
+                 # print(f"Line {line_index}: Sending request to OpenAI TTS with params: {{'model': '{model}', 'voice': '{voice}', 'speed': {request_params.get('speed', 1.0)}, 'has_instructions': {bool(request_params.get('instructions'))}}}")

                  response = await client.audio.speech.create(**request_params)
+
+                 # Stream the response to file
                  await response.astream_to_file(output_path)
+
+                 # Verify the file was created and has content
+                 if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
+                     return output_path
+                 else:
+                     print(f"Line {line_index if line_index != -1 else ''}: Synthesis appeared to succeed but the output file is missing or empty: {output_path}")
+                     return None # File not created, or empty
+
              except RateLimitError as e:
                  current_retry += 1
+                 if current_retry > MAX_RETRIES:
+                     print(f"Line {line_index if line_index != -1 else ''}: Max retries reached due to RateLimitError. Error: {e}")
                      return None
+
+                 # Exponential backoff; jitter could be added, but simple doubling for now
+                 print(f"Line {line_index if line_index != -1 else ''}: Rate limit hit (Attempt {current_retry}/{MAX_RETRIES}). Retrying in {backoff_seconds:.2f}s...")
                  await asyncio.sleep(backoff_seconds)
+                 backoff_seconds = min(backoff_seconds * 2, MAX_BACKOFF_SECONDS) # Increase backoff, capped at the max
+
+             except OpenAIError as e: # Catch other specific OpenAI errors
+                 print(f"Line {line_index if line_index != -1 else ''}: OpenAI API error during synthesis: {type(e).__name__} - {e}")
                  return None
+
+             except Exception as e: # Catch any other unexpected errors
+                 print(f"Line {line_index if line_index != -1 else ''}: An unexpected error occurred during synthesis: {type(e).__name__} - {e}")
+                 # current_retry += 1 # Could also retry on generic errors if deemed transient
+                 # if current_retry > MAX_RETRIES: return None
+                 # await asyncio.sleep(backoff_seconds)
+                 # backoff_seconds = min(backoff_seconds * 2, MAX_BACKOFF_SECONDS)
+                 return None # For most unexpected errors it is safer not to retry indefinitely
+
+         # Reached only if the loop exits after max retries without returning output_path
+         print(f"Line {line_index if line_index != -1 else ''}: Failed to synthesize after all retries or due to a non-retryable error.")
+         return None

  if __name__ == '__main__':
      async def main_test():
          api_key = os.getenv("OPENAI_API_KEY")
          if not api_key:
+             print("OPENAI_API_KEY environment variable not set. Skipping test.")
              return

+         # Test with a mock NSFW API template.
+         # Replace it with a real one if you have one, or set it to None to disable the check.
+         mock_nsfw_template = "https://api.example.com/nsfw_check?text={text}" # This will likely fail open
+
          client = AsyncOpenAI(api_key=api_key)

+         test_lines_data = [
+             {"id": 0, "text": "Hello from Alloy, this is a test of standard tts-1-hd.", "voice": "alloy", "model": "tts-1-hd", "speed": 1.0},
+             {"id": 1, "text": "Echo here, speaking a bit faster.", "voice": "echo", "model": "tts-1-hd", "speed": 1.3},
+             {"id": 2, "text": "Fable, narrating slowly and calmly.", "voice": "fable", "model": "tts-1", "speed": 0.8},
+             {"id": 3, "text": "This is Onyx with instructions for gpt-4o-mini-tts: speak with a deep, commanding voice.", "voice": "onyx", "model": "gpt-4o-mini-tts", "instructions": "Speak with a very deep, commanding and slightly robotic voice."},
+             {"id": 4, "text": "Nova, testing default speed with tts-1.", "voice": "nova", "model": "tts-1"},
+             {"id": 5, "text": "Shimmer testing gpt-4o-mini-tts without specific instructions.", "voice": "shimmer", "model": "gpt-4o-mini-tts"},
+             {"id": 6, "text": "This line contains potentially naughty words that might be flagged.", "voice": "alloy", "model": "tts-1-hd", "nsfw_check": True}, # Test NSFW
+             {"id": 7, "text": "", "voice": "echo", "model": "tts-1"}, # Test empty text
          ]

+         temp_output_dir = "test_audio_output_openai_tts"
+         os.makedirs(temp_output_dir, exist_ok=True)
+         print(f"Test audio will be saved in ./{temp_output_dir}/")

+         synthesis_tasks = []
+         for line_data in test_lines_data:
+             output_file_path = os.path.join(temp_output_dir, f"line_{line_data['id']}_{line_data['voice']}_{line_data['model']}.mp3")

+             nsfw_url = mock_nsfw_template if line_data.get("nsfw_check") else None
+
+             synthesis_tasks.append(
                  synthesize_speech_line(
+                     client=client,
+                     text=line_data["text"],
+                     voice=line_data["voice"],
+                     output_path=output_file_path,
+                     model=line_data["model"],
+                     speed=line_data.get("speed", 1.0), # Default speed if not specified
                      instructions=line_data.get("instructions"),
+                     nsfw_api_url_template=nsfw_url,
                      line_index=line_data['id']
                  )
              )

+         results = await asyncio.gather(*synthesis_tasks)
+
+         successful_files_count = 0
+         print("\n--- Test Synthesis Results ---")
+         for i, result_path in enumerate(results):
+             if result_path and os.path.exists(result_path):
+                 print(f"SUCCESS: Line {test_lines_data[i]['id']} -> {result_path} (Size: {os.path.getsize(result_path)} bytes)")
+                 successful_files_count += 1
+             else:
+                 print(f"FAILURE or SKIP: Line {test_lines_data[i]['id']} (Text: '{test_lines_data[i]['text'][:30]}...')")

+         print(f"\nSuccessfully synthesized {successful_files_count} out of {len(test_lines_data)} lines.")
+         print(f"Please check the ./{temp_output_dir}/ directory for output files.")

+     # Run the async test
+     if os.name == 'nt': # Required for the Windows asyncio selector policy
          asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
      asyncio.run(main_test())
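A minimal driver for `synthesize_speech_line` as defined above (a sketch, assuming the repo root is on `sys.path` and `OPENAI_API_KEY` is set; with `MAX_CONCURRENT_REQUESTS = 2` at most two requests run concurrently, and rate-limited calls back off 1s -> 2s -> 4s before giving up):

```python
import asyncio
import os

from openai import AsyncOpenAI

from utils.openai_tts import synthesize_speech_line

async def demo() -> None:
    client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
    # One line, keyword arguments matching the signature above.
    path = await synthesize_speech_line(
        client,
        text="Quick smoke test of a single line.",
        voice="nova",
        output_path="demo_line.mp3",
        model="tts-1",  # 'speed' only applies to tts-1 / tts-1-hd
        speed=1.1,
        line_index=0,
    )
    print("written:" if path else "failed/skipped:", path)

asyncio.run(demo())
```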
utils/script_parser.py CHANGED
@@ -2,15 +2,15 @@ import re
  import math

  MAX_SCRIPT_LENGTH = 10000 # characters
- TTS_1_HD_COST_PER_CHAR = 0.00003 # $30 / 1M chars
- GPT_4O_MINI_TTS_COST_PER_SECOND = 0.015 / 60 # $0.015 / minute
- CHARS_PER_SECOND_ESTIMATE = 10 # Rough estimate for TTS duration

  def parse_dialogue_script(script_text):
      """
-     Parses a dialogue script into a list of (index, speaker, utterance) tuples.
      Input format: "[Speaker] Utterance" per line.
-     Lines not matching the format are attempted to be parsed as "[Default] Utterance".
      """
      lines = script_text.strip().split('\n')
      parsed_lines = []
@@ -22,22 +22,24 @@ def parse_dialogue_script(script_text):
      for i, line_content in enumerate(lines):
          line_content = line_content.strip()
          if not line_content:
-             continue

          match = re.match(r'\[(.*?)\]\s*(.*)', line_content)
          if match:
              speaker, utterance = match.groups()
              utterance = utterance.strip()
          else:
-             # If no speaker tag, assign a default speaker or handle as per requirements
-             # For now, let's assume the whole line is an utterance by a "Narrator" or similar
-             speaker = "Narrator" # Or consider raising an error/warning
-             utterance = line_content.strip()

-         if not utterance: # Skip if utterance is empty after parsing
              continue

-         parsed_lines.append({"id": i, "speaker": speaker.strip(), "text": utterance})
          total_chars += len(utterance)

      return parsed_lines, total_chars
@@ -46,38 +48,68 @@ def calculate_cost(total_chars, num_lines, model_name="tts-1-hd"):
      """
      Calculates the estimated cost for TTS processing.
      """
-     if model_name == "tts-1-hd":
-         cost = total_chars * TTS_1_HD_COST_PER_CHAR
      elif model_name == "gpt-4o-mini-tts":
-         # Estimate duration: total_chars / X chars per second
-         # This is a very rough estimate. Actual duration depends on OpenAI's model.
-         estimated_seconds = total_chars / CHARS_PER_SECOND_ESTIMATE
          cost = estimated_seconds * GPT_4O_MINI_TTS_COST_PER_SECOND
-     else:
-         raise ValueError(f"Unknown model for cost calculation: {model_name}")
      return cost

  if __name__ == '__main__':
-     sample_script = """
  [Alice] Hello Bob, how are you?
  [Bob] I'm fine, Alice. And you?
  This is a line without a speaker tag.
  [Charlie] Just listening in.
  """
-     parsed, chars = parse_dialogue_script(sample_script)
      print("Parsed Lines:")
      for p_line in parsed:
          print(p_line)
-     print(f"\nTotal Characters: {chars}")

      cost_hd = calculate_cost(chars, len(parsed), "tts-1-hd")
      print(f"Estimated cost for tts-1-hd: ${cost_hd:.6f}")

      cost_gpt_mini = calculate_cost(chars, len(parsed), "gpt-4o-mini-tts")
-     print(f"Estimated cost for gpt-4o-mini-tts: ${cost_gpt_mini:.6f}")

-     long_script = "a" * (MAX_SCRIPT_LENGTH + 1)
      try:
-         parse_dialogue_script(long_script)
      except ValueError as e:
-         print(f"Error for long script: {e}")

  import math

  MAX_SCRIPT_LENGTH = 10000 # characters
+ TTS_1_HD_COST_PER_CHAR = 0.00003 # $30 / 1M chars (the tts-1-hd rate; tts-1 is $15 / 1M, so this over-estimates tts-1)
+ GPT_4O_MINI_TTS_COST_PER_SECOND = 0.015 / 60 # $0.015 / minute for gpt-4o-mini-tts
+ CHARS_PER_SECOND_ESTIMATE = 12 # Average characters spoken per second, for duration estimation

  def parse_dialogue_script(script_text):
      """
+     Parses a dialogue script into a list of dictionaries, each representing one line.
      Input format: "[Speaker] Utterance" per line.
+     Lines not matching the format are assigned to a "Narrator" speaker.
      """
      lines = script_text.strip().split('\n')
      parsed_lines = []

      for i, line_content in enumerate(lines):
          line_content = line_content.strip()
          if not line_content:
+             continue # Skip empty lines

          match = re.match(r'\[(.*?)\]\s*(.*)', line_content)
          if match:
              speaker, utterance = match.groups()
+             speaker = speaker.strip()
              utterance = utterance.strip()
+             if not speaker: # The speaker tag is empty, as in "[] Text"
+                 speaker = "UnknownSpeaker"
          else:
+             # No speaker tag: treat the whole line as an utterance by "Narrator"
+             speaker = "Narrator"
+             utterance = line_content # Already stripped

+         if not utterance: # Skip if the utterance is empty after parsing (e.g. "[Speaker]" with no text)
              continue

+         parsed_lines.append({"id": i, "speaker": speaker, "text": utterance})
          total_chars += len(utterance)

      return parsed_lines, total_chars

      """
      Calculates the estimated cost for TTS processing.
      """
+     cost = 0.0
+     if model_name in ["tts-1", "tts-1-hd"]: # tts-1 is actually cheaper; the tts-1-hd rate gives a conservative estimate for both
+         cost = total_chars * TTS_1_HD_COST_PER_CHAR
      elif model_name == "gpt-4o-mini-tts":
+         # Estimate duration as total_chars / CHARS_PER_SECOND_ESTIMATE. This is a rough estimate;
+         # the actual duration depends on the voice and the text.
+         # Note: OpenAI's tts-1 models are priced per character, but the spec for this app prices
+         # gpt-4o-mini-tts by duration: seconds x ($0.015 / 60), i.e. $0.015 per minute.
+         # The duration-based formula from the spec is used here.
+         if CHARS_PER_SECOND_ESTIMATE <= 0: # Avoid division by zero
+             estimated_seconds = total_chars / 10.0 # Fallback chars/sec
+         else:
+             estimated_seconds = total_chars / CHARS_PER_SECOND_ESTIMATE
          cost = estimated_seconds * GPT_4O_MINI_TTS_COST_PER_SECOND
+     else: # Fall back to character-based costing for any other tts-1-like model
+         cost = total_chars * TTS_1_HD_COST_PER_CHAR
+         # Alternatively: raise ValueError(f"Unknown model for cost calculation: {model_name}")
      return cost

  if __name__ == '__main__':
+     sample_script_1 = """
  [Alice] Hello Bob, how are you?
  [Bob] I'm fine, Alice. And you?
  This is a line without a speaker tag.
  [Charlie] Just listening in.
+ [] This line has an empty speaker tag.
+ [EmptySpeakerText]
  """
+     print("--- Test Case 1: Mixed Script ---")
+     parsed, chars = parse_dialogue_script(sample_script_1)
      print("Parsed Lines:")
      for p_line in parsed:
          print(p_line)
+     print(f"\nTotal Characters for TTS: {chars}")

      cost_hd = calculate_cost(chars, len(parsed), "tts-1-hd")
      print(f"Estimated cost for tts-1-hd: ${cost_hd:.6f}")

+     cost_tts1 = calculate_cost(chars, len(parsed), "tts-1")
+     print(f"Estimated cost for tts-1: ${cost_tts1:.6f}")
+
+     # Test the cost for gpt-4o-mini-tts using the per-second formula
      cost_gpt_mini = calculate_cost(chars, len(parsed), "gpt-4o-mini-tts")
+     print(f"Estimated cost for gpt-4o-mini-tts (at {CHARS_PER_SECOND_ESTIMATE} chars/sec): ${cost_gpt_mini:.6f}")
+
+     print("\n--- Test Case 2: Long Script (Boundary Check) ---")
+     long_script_text = "[SpeakerA] " + "a" * (MAX_SCRIPT_LENGTH - 11) # 11 chars for "[SpeakerA] "
+     parsed_long, chars_long = parse_dialogue_script(long_script_text)
+     print(f"Long script (length {len(long_script_text)} chars) parsed successfully. TTS Chars: {chars_long}")

      try:
+         too_long_script = "a" * (MAX_SCRIPT_LENGTH + 1)
+         parse_dialogue_script(too_long_script)
      except ValueError as e:
+         print(f"Correctly caught error for too-long script: {e}")
+
+     print("\n--- Test Case 3: Empty and Invalid Scripts ---")
+     parsed_empty, chars_empty = parse_dialogue_script("")
+     print(f"Empty script: Parsed lines: {len(parsed_empty)}, Chars: {chars_empty}")
+     parsed_blank_lines, chars_blank_lines = parse_dialogue_script("\n\n[Speaker]\n\n")
+     print(f"Script with blank/invalid lines: Parsed lines: {len(parsed_blank_lines)}, Chars: {chars_blank_lines} (Result: {parsed_blank_lines})")