abocha committed on
Commit
5c85d81
·
1 Parent(s): b26f1ff

speed, instructions, granular voice change, tts-1 available

Files changed (2)
  1. app.py +290 -151
  2. utils/openai_tts.py +58 -46
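The new "Detailed Configuration (JSON)" mode accepts a list of speaker objects. A minimal illustrative payload (field names and the 0.25-4.0 speed range are taken from the diff below; the concrete values here are made up):

```python
# Hypothetical example of the "Detailed Configuration (JSON)" payload added in app.py.
# "voice" is required; "speed" (0.25-4.0, honored by tts-1/tts-1-hd) and
# "instructions" (honored by gpt-4o-mini-tts) are optional and fall back to the
# global UI settings when omitted.
detailed_speaker_json = """
[
  {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},
  {"speaker": "Bob", "voice": "echo"}
]
"""
```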
app.py CHANGED
@@ -5,20 +5,20 @@ import tempfile
 import shutil
 import zipfile
 import random
+import json
 from openai import AsyncOpenAI
 
 from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
-from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES
+from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES  # Renamed for clarity
 from utils.merge_audio import merge_mp3_files
 
 # --- Configuration ---
 OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")  # e.g., "https://api.example.com/nsfw-check?text={text}"
-MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")  # Default model
+NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
+MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
 
 # Ensure API key is available
 if not OPENAI_API_KEY:
-    # Try to read from Hugging Face secrets if running in a Space
     try:
         from huggingface_hub import HfApi
         api = HfApi()
@@ -30,82 +30,170 @@ if not OPENAI_API_KEY:
         MODEL_DEFAULT = secrets.get("MODEL_DEFAULT", MODEL_DEFAULT)
     except Exception as e:
         print(f"Could not retrieve secrets from Hugging Face Hub: {e}")
-    # Potentially raise an error or disable functionality if key is essential
-    # For now, we'll let it proceed, and OpenAI client init will fail later if key is still None.
 
-# Initialize OpenAI client if key is found
 async_openai_client = None
 if OPENAI_API_KEY:
     async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
 else:
-    # This will be displayed in the UI if the API key is missing
-    print("ERROR: OPENAI_API_KEY secret is not set. The application will not function.")
+    print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
 
-SPEAKER_VOICE_MAPPING_PRESETS = ["Single voice (alloy)", "Random per Speaker", "A/B (alloy, echo, ...)"]
-TTS_MODELS = ["tts-1-hd", "gpt-4o-mini-tts"]
-
-# Global store for speaker_to_voice mapping to persist across Gradio calls if needed
-# (though for this app, it's recalculated each time based on preset)
-_speaker_voice_map_cache = {}
-
-def get_voice_for_speaker(speaker, preset, parsed_lines):
-    global _speaker_voice_map_cache
-
-    unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
-
-    if preset == "Single voice (alloy)":
-        return OPENAI_VOICES[0]  # Default to 'alloy' or first available
-
-    if preset == "Random per Speaker":
-        if speaker not in _speaker_voice_map_cache or _speaker_voice_map_cache.get("__preset") != preset:
-            # Re-initialize map if preset changed or speaker not found
-            _speaker_voice_map_cache = {"__preset": preset}
-            # Assign a random voice to each unique speaker
-            available_voices_shuffled = random.sample(OPENAI_VOICES, len(OPENAI_VOICES))
-            for i, spk in enumerate(unique_speakers):
-                _speaker_voice_map_cache[spk] = available_voices_shuffled[i % len(available_voices_shuffled)]
-        return _speaker_voice_map_cache.get(speaker, OPENAI_VOICES[0])  # Fallback
-
-    if preset == "A/B (alloy, echo, ...)":
-        if speaker not in _speaker_voice_map_cache or _speaker_voice_map_cache.get("__preset") != preset:
-            _speaker_voice_map_cache = {"__preset": preset}
-            for i, spk in enumerate(unique_speakers):
-                _speaker_voice_map_cache[spk] = OPENAI_VOICES[i % len(OPENAI_VOICES)]
-        return _speaker_voice_map_cache.get(speaker, OPENAI_VOICES[0])  # Fallback
-
-    return OPENAI_VOICES[0]  # Default fallback
+# TTS models available in the UI
+TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
+if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE:  # Ensure default is valid
+    MODEL_DEFAULT = "tts-1-hd"
+
+SPEAKER_CONFIG_METHODS = [
+    "Single Voice (Global)",
+    "Random per Speaker",
+    "A/B Round Robin",
+    "Detailed Configuration (JSON)"
+]
+DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
+
+# For UI elements that need the list of voices
+APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
+
+# Global store for parsed speaker configurations
+_speaker_config_cache = {}
+
+def parse_detailed_speaker_config(json_text, parsed_script_lines):
+    """Parses the JSON config string into a speaker map."""
+    config_map = {}
+    default_voice = APP_AVAILABLE_VOICES[0]
+    try:
+        if not json_text.strip():
+            return {}  # Empty config means rely on global or other fallbacks
+
+        config_list = json.loads(json_text)
+        if not isinstance(config_list, list):
+            raise ValueError("JSON config must be a list of speaker objects.")
+
+        for item in config_list:
+            if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
+                # Log warning or skip malformed item
+                print(f"Skipping malformed item in JSON config: {item}")
+                continue
+            if item["voice"] not in APP_AVAILABLE_VOICES:
+                print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
+                item["voice"] = default_voice
+
+            # Validate speed if present
+            if "speed" in item:
+                try:
+                    item["speed"] = float(item["speed"])
+                    if not (0.25 <= item["speed"] <= 4.0):
+                        print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping.")
+                        item["speed"] = max(0.25, min(item["speed"], 4.0))  # Clamp
+                except ValueError:
+                    print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
+                    # Drop it, so global/default speed applies
+                    item.pop("speed", None)
+
+            config_map[item["speaker"]] = {
+                "voice": item["voice"],
+                "speed": item.get("speed"),  # Will be None if not present or invalid
+                "instructions": item.get("instructions")  # Will be None if not present
+            }
+        return config_map
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
+    except ValueError as e:  # Catch our own ValueErrors
+        raise e  # Re-raise
+    except Exception as e:  # Catch any other unexpected errors during parsing
+        raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
+
+
+def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
+                           global_selected_voice, detailed_config_map):
+    global _speaker_config_cache  # This cache helps maintain consistency for "Random" and "A/B" within a run
+
+    # If the method changed or the cache is for a different set of speakers, reset it.
+    # A more robust cache key might involve hashing unique_script_speakers.
+    if _speaker_config_cache.get("__method") != speaker_config_method or \
+       _speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
+        _speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
+
+    base_config = {"voice": APP_AVAILABLE_VOICES[0], "speed": None, "instructions": None}
+
+    if speaker_config_method == "Single Voice (Global)":
+        base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
+        return base_config
+
+    if speaker_config_method == "Detailed Configuration (JSON)":
+        if speaker_name in detailed_config_map:
+            # Merge with base_config to ensure all keys are present if some are optional in JSON;
+            # JSON values take precedence
+            speaker_specific = detailed_config_map[speaker_name]
+            return {
+                "voice": speaker_specific.get("voice", base_config["voice"]),
+                "speed": speaker_specific.get("speed"),  # Allow None to use global
+                "instructions": speaker_specific.get("instructions")  # Allow None to use global
+            }
+        else:  # Fallback for speakers in the script but not in the JSON map
+            # Could use a default voice, or a cycling voice for unmapped speakers.
+            # For now, use the first voice as a simple fallback.
+            print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
+            return base_config
+
+    # For "Random" and "A/B", we only map voices. Speed/instructions will be global.
+    if speaker_name not in _speaker_config_cache:
+        if speaker_config_method == "Random per Speaker":
+            # Assign a random voice to each unique speaker on the first call of a run
+            available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
+            if not _speaker_config_cache.get("__all_assigned_random"):
+                for i, spk_unique in enumerate(unique_script_speakers):
+                    if spk_unique not in _speaker_config_cache:
+                        _speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
+                _speaker_config_cache["__all_assigned_random"] = True
+
+            if speaker_name not in _speaker_config_cache:  # Should not happen if pre-populated
+                _speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
+
+        elif speaker_config_method == "A/B Round Robin":
+            # Assign voices in order on the first call of a run
+            if not _speaker_config_cache.get("__all_assigned_ab"):
+                for i, spk_unique in enumerate(unique_script_speakers):
+                    if spk_unique not in _speaker_config_cache:
+                        _speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
+                _speaker_config_cache["__all_assigned_ab"] = True
+
+            if speaker_name not in _speaker_config_cache:  # Fallback, should be populated
+                speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
+                _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
+        else:  # Should not happen
+            _speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
+
+    # Return only the voice for Random/A-B; speed/instructions come from the global UI inputs
+    cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
+    return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
+
 
 async def handle_script_processing(
     dialogue_script: str,
     tts_model: str,
     pause_ms: int,
-    speaker_voice_preset: str,
+    speaker_config_method: str,
+    global_voice_selection: str,  # From dropdown if "Single Voice (Global)"
+    detailed_speaker_json: str,  # From JSON input
+    global_speed: float,
+    global_instructions: str,
     progress=gr.Progress(track_tqdm=True)
 ):
-    global _speaker_voice_map_cache
-    _speaker_voice_map_cache = {}  # Reset speaker voice map for each new run
+    global _speaker_config_cache
+    _speaker_config_cache = {}  # Reset speaker config cache for each new run
 
     if not OPENAI_API_KEY or not async_openai_client:
         return None, None, "Error: OPENAI_API_KEY is not configured. Cannot proceed."
-
     if not dialogue_script.strip():
         return None, None, "Error: Script is empty."
 
-    # Ensure /tmp exists (it should on HF Spaces)
-    os.makedirs("/tmp", exist_ok=True)
-
-    # Create a unique temporary directory for this job's files
-    # This helps in cleaning up and avoiding conflicts if /tmp is shared/persistent
-    # temp_job_dir = tempfile.mkdtemp(dir="/tmp", prefix="dialogue_tts_")
-    # Using a fixed sub-directory in /tmp for simplicity for now, and clearing it.
-    # For more robust multi-user on same instance (not free tier concern), mkdtemp is better.
-
-    # Path for this job's audio files within /tmp
-    # Using a more specific path for cleanup
     job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
-
-    # Clear previous job's files from the specific prefix directory
     if os.path.exists(job_audio_path_prefix):
         shutil.rmtree(job_audio_path_prefix)
     os.makedirs(job_audio_path_prefix, exist_ok=True)
@@ -113,34 +201,57 @@ async def handle_script_processing(
     try:
         parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
         if not parsed_lines:
-            # shutil.rmtree(job_audio_path_prefix)  # Clean up
-            return None, None, "Error: No valid dialogue lines found in the script."
-    except ValueError as e:  # Handles MAX_SCRIPT_LENGTH
-        # shutil.rmtree(job_audio_path_prefix)  # Clean up
+            return None, None, "Error: No valid dialogue lines found."
+    except ValueError as e:
         return None, None, f"Script parsing error: {str(e)}"
 
-    tasks = []
-    line_audio_files = [None] * len(parsed_lines)  # To store paths in original order
+    unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
+    parsed_detailed_config_map = {}
+    if speaker_config_method == "Detailed Configuration (JSON)":
+        try:
+            parsed_detailed_config_map = parse_detailed_speaker_config(detailed_speaker_json, parsed_lines)
+        except ValueError as e:
+            return None, None, f"Configuration Error: {str(e)}"
 
-    # Determine voices for speakers based on preset
-    # This loop is just to pre-populate the cache based on all unique speakers
-    # The actual voice assignment per line happens inside the task creation loop
-    # To ensure stability of voice per speaker if "Random" is chosen.
-    _ = [get_voice_for_speaker(p["speaker"], speaker_voice_preset, parsed_lines) for p in parsed_lines]
+    tasks = []
+    line_audio_files = [None] * len(parsed_lines)
 
     for i, line_data in enumerate(parsed_lines):
-        voice = get_voice_for_speaker(line_data["speaker"], speaker_voice_preset, parsed_lines)
-        output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
+        speaker_name = line_data["speaker"]
+
+        # Get base config (primarily voice) based on method.
+        # For "Random" and "A/B", this will just return voice. Speed/instructions are taken from global.
+        # For "Single Voice", it returns the globally selected voice.
+        # For "Detailed JSON", it returns voice, and potentially per-speaker speed/instructions.
+        speaker_base_cfg = get_config_for_speaker(
+            speaker_name, speaker_config_method, unique_speakers,
+            global_voice_selection, parsed_detailed_config_map
+        )
+
+        line_voice = speaker_base_cfg["voice"]
 
-        progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({line_data['speaker']})")
+        # Determine effective speed and instructions.
+        # Priority: per-speaker from JSON > global UI > API default (1.0 for speed, None for instructions)
+        effective_speed = global_speed  # Start with global
+        if speaker_base_cfg.get("speed") is not None:  # If JSON provided a speed for this speaker
+            effective_speed = speaker_base_cfg["speed"]
+
+        effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
+        if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip():  # If JSON provided instructions
+            effective_instructions = speaker_base_cfg["instructions"]
+
+        output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
+        progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
 
         tasks.append(
             synthesize_speech_line(
                 client=async_openai_client,
                 text=line_data["text"],
-                voice=voice,
+                voice=line_voice,
                 output_path=output_filename,
                 model=tts_model,
+                speed=effective_speed,
+                instructions=effective_instructions,
                 nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
                 line_index=line_data['id']
             )
@@ -148,60 +259,29 @@ async def handle_script_processing(
 
     synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
 
-    # Place results into line_audio_files based on original line_data['id'] if possible,
-    # or simply by order of completion if IDs are not perfectly mapping (should not happen with current setup).
-    # For now, assuming `synthesis_results` order matches `parsed_lines` due to `asyncio.gather` preserving order.
     for idx, result in enumerate(synthesis_results):
         if isinstance(result, Exception):
-            # Log the error, the file will remain None
             print(f"Error during synthesis for line {parsed_lines[idx]['id']}: {result}")
-            # Potentially provide feedback to user about failed lines
-        elif result is None:  # Synthesis skipped (e.g. NSFW) or failed internally
+        elif result is None:
             print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
         else:
-            # Store the path of the successfully synthesized file
-            # We need to map 'idx' from gather result back to original 'id' if they differ
-            # For now, assume parsed_lines[idx]['id'] is the relevant one.
-            # line_audio_files is already indexed by 'idx' which corresponds to parsed_lines order.
            line_audio_files[idx] = result
 
     valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
 
     if not valid_audio_files:
-        shutil.rmtree(job_audio_path_prefix)  # Clean up
-        return None, None, "Error: No audio files were successfully synthesized. Check logs for details."
+        shutil.rmtree(job_audio_path_prefix)
+        return None, None, "Error: No audio files were successfully synthesized."
 
-    # 1. Create ZIP of individual lines
     zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
     with zipfile.ZipFile(zip_filename, 'w') as zf:
         for audio_file_path in valid_audio_files:
             zf.write(audio_file_path, os.path.basename(audio_file_path))
 
-    # 2. Create merged MP3
-    # We need to pass the *ordered* list of successfully generated files for merging
-    # `line_audio_files` contains paths or None, in the original script order.
     ordered_valid_files_for_merge = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
-
     merged_mp3_filename = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
     merged_output_path = merge_mp3_files(ordered_valid_files_for_merge, merged_mp3_filename, pause_ms)
 
-    if not merged_output_path:
-        # Merging failed, but we might still have the zip.
-        # Return zip, and None for merged, with an error message.
-        # For simplicity now, let's consider this a partial success if zip is there.
-        # Or, make it an error:
-        # shutil.rmtree(job_audio_path_prefix)
-        # return None, None, "Error: Failed to merge audio files, though individual lines might be available."
-        # Let's return what we have
-        print("Warning: Merged MP3 generation failed. Individual files might still be in ZIP.")
-        # We still return the zip path, and None for merged path
-
-    # Gradio output: individual_lines_zip, merged_dialogue_mp3, status_message
-    # If merged_output_path is None, Gradio will show nothing for that component if its type expects a file.
-    # It's better to return a string path, even if the file doesn't exist, and let Gradio handle it,
-    # or handle it by returning None and ensure the component can take None.
-    # For File output, returning None is fine.
-
     final_status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
     if len(valid_audio_files) < len(parsed_lines):
         final_status += "Some lines failed or were skipped. "
@@ -212,20 +292,6 @@ async def handle_script_processing(
     else:
         final_status += "Outputs generated."
 
-    # Make copies of the files to a location Gradio can serve if they are in the job_audio_path_prefix
-    # which might be cleaned up. Gradio makes its own copies for File output components.
-    # So, returning paths from job_audio_path_prefix is fine.
-
-    # Return paths for Gradio File components
-    # Gradio will make these downloadable.
-    # If a file doesn't exist or is None, the Gradio component should handle it gracefully (e.g., show nothing).
-
-    # Intentionally DO NOT clean up job_audio_path_prefix here.
-    # Gradio needs access to these files to serve them.
-    # Cleanup should happen at the start of the *next* run, or via a different mechanism
-    # if HF Spaces doesn't clean /tmp periodically.
-    # The current strategy of clearing job_audio_path_prefix at the start of handle_script_processing is good.
-
     return zip_filename if os.path.exists(zip_filename) else None, \
            merged_output_path if merged_output_path and os.path.exists(merged_output_path) else None, \
            final_status
@@ -240,44 +306,72 @@ def handle_calculate_cost(dialogue_script: str, tts_model: str):
             return "Cost: $0.000000 (No valid lines)"
         cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
         return f"Estimated OpenAI Cost: ${cost:.6f} (for {total_chars} characters in {len(parsed_lines)} lines)"
-    except ValueError as e:  # Handles MAX_SCRIPT_LENGTH
+    except ValueError as e:
        return f"Error: {str(e)}"
     except Exception as e:
         return f"Error calculating cost: {str(e)}"
 
 # --- Gradio Interface ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# Dialogue Script to Speech Converter")
+    gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
     gr.Markdown(
-        "Convert an entire dialogue script into individual MP3 files for each line and a single merged MP3. "
-        "Supports `tts-1-hd` and `gpt-4o-mini-tts` from OpenAI. "
-        "Uses a maximum of 2 concurrent TTS requests."
+        "Convert dialogue scripts to speech with fine-grained control over voices, speed, and instructions."
     )
     if not OPENAI_API_KEY or not async_openai_client:
-        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid. The application will not be able to synthesize audio.</h3>")
-
+        gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid. Speech synthesis will fail.</h3>")
 
     with gr.Row():
         with gr.Column(scale=2):
             script_input = gr.TextArea(
                 label="Dialogue Script",
-                placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi! You are a bold one.",
+                placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!",
                 lines=10,
                 info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
             )
         with gr.Column(scale=1):
             tts_model_dropdown = gr.Dropdown(
-                TTS_MODELS, label="OpenAI TTS Model", value=MODEL_DEFAULT, info="Select TTS model."
-            )
-            speaker_voice_preset_dropdown = gr.Dropdown(
-                SPEAKER_VOICE_MAPPING_PRESETS, label="Speaker Voice Assignment", value=SPEAKER_VOICE_MAPPING_PRESETS[0],
-                info="How voices are assigned to speakers."
+                TTS_MODELS_AVAILABLE, label="OpenAI TTS Model", value=MODEL_DEFAULT,
+                info="Select TTS model. Affects available controls below."
             )
             pause_input = gr.Number(
                 label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
                 info="Silence duration in milliseconds between merged lines."
             )
+            # Model-specific global controls
+            global_speed_input = gr.Slider(
+                minimum=0.25, maximum=4.0, value=1.0, step=0.05,
+                label="Global Speech Speed (for tts-1/tts-1-hd)",
+                visible=(MODEL_DEFAULT in ["tts-1", "tts-1-hd"]),
+                interactive=True
+            )
+            global_instructions_input = gr.Textbox(
+                label="Global Voice Instructions (for gpt-4o-mini-tts)",
+                placeholder="e.g., Speak in a calm, reassuring tone.",
+                visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"),
+                interactive=True, lines=2
+            )
 
+    gr.Markdown("### Speaker Configuration")
+    with gr.Row():
+        speaker_config_method_dropdown = gr.Dropdown(
+            SPEAKER_CONFIG_METHODS, label="Speaker Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
+        )
+        global_voice_dropdown = gr.Dropdown(
+            APP_AVAILABLE_VOICES, label="Global Voice (for 'Single Voice' method)",
+            value=APP_AVAILABLE_VOICES[0],
+            visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"),
+            interactive=True
+        )
+
+    detailed_speaker_config_input = gr.Code(
+        label="Detailed Speaker Configuration (JSON)",
+        language="json",
+        lines=7,
+        placeholder='[\n {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},\n {"speaker": "Bob", "voice": "echo"},\n {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}\n]',
+        visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)"),
+        info="Define voice, and optionally model-compatible speed/instructions per speaker."
+    )
+
     with gr.Row():
         calculate_cost_button = gr.Button("Calculate Estimated Cost")
         generate_button = gr.Button("Generate Audio Files", variant="primary")
@@ -288,8 +382,38 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         individual_lines_zip_output = gr.File(label="Download ZIP of Individual Lines")
         merged_dialogue_mp3_output = gr.Audio(label="Merged Dialogue MP3", type="filepath")
 
-    status_output = gr.Textbox(label="Status", interactive=False)
+    status_output = gr.Textbox(label="Status", interactive=False, lines=2)
+
+    # --- Event Handlers for UI Interactivity ---
+    def update_model_specific_controls_visibility(selected_model):
+        is_tts_1_family = selected_model in ["tts-1", "tts-1-hd"]
+        is_gpt_mini = selected_model == "gpt-4o-mini-tts"  # And any other future models supporting instructions
+        return {
+            global_speed_input: gr.update(visible=is_tts_1_family, interactive=is_tts_1_family),
+            global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
+        }
+
+    tts_model_dropdown.change(
+        fn=update_model_specific_controls_visibility,
+        inputs=[tts_model_dropdown],
+        outputs=[global_speed_input, global_instructions_input]
+    )
 
+    def update_speaker_config_visibility(config_method):
+        is_single_voice = (config_method == "Single Voice (Global)")
+        is_detailed_json = (config_method == "Detailed Configuration (JSON)")
+        return {
+            global_voice_dropdown: gr.update(visible=is_single_voice, interactive=is_single_voice),
+            detailed_speaker_config_input: gr.update(visible=is_detailed_json, interactive=is_detailed_json)
+        }
+
+    speaker_config_method_dropdown.change(
+        fn=update_speaker_config_visibility,
+        inputs=[speaker_config_method_dropdown],
+        outputs=[global_voice_dropdown, detailed_speaker_config_input]
+    )
+
+    # --- Main Button Actions ---
     calculate_cost_button.click(
         fn=handle_calculate_cost,
         inputs=[script_input, tts_model_dropdown],
@@ -298,34 +422,49 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
     generate_button.click(
         fn=handle_script_processing,
-        inputs=[script_input, tts_model_dropdown, pause_input, speaker_voice_preset_dropdown],
+        inputs=[
+            script_input, tts_model_dropdown, pause_input,
+            speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
+            global_speed_input, global_instructions_input
+        ],
         outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
-        # api_name="generate_audio"  # if you want to expose an API endpoint
     )
 
     gr.Markdown("## Examples")
+    example_script_1 = "[Alice] Hi Bob, how are things?\n[Bob] Pretty good, Alice. Just working on this new project.\n[Alice] Oh, interesting! Tell me more."
+    example_json_1 = """
+[
+  {"speaker": "Alice", "voice": "nova", "instructions": "sound curious"},
+  {"speaker": "Bob", "voice": "echo", "speed": 0.9}
+]
+""".strip()
+
+    example_script_2 = "[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain. I am speaking slowly.\n[Captain Eva] Excellent. Maintain course for Kepler-186f."
+    example_json_2 = """
+[
+  {"speaker": "Captain Eva", "voice": "alloy", "speed": 1.0},
+  {"speaker": "Computer", "voice": "onyx", "speed": 0.8, "instructions": "sound robotic and calm"}
+]
+""".strip()
+
     gr.Examples(
         examples=[
-            [
-                "[Alice] Hi Bob, how are things?\n[Bob] Pretty good, Alice. Just working on this new project.\n[Alice] Oh, interesting! Tell me more.",
-                "tts-1-hd", 250, SPEAKER_VOICE_MAPPING_PRESETS[1]
-            ],
-            [
-                "[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain.\n[Captain Eva] Excellent. Maintain course for Kepler-186f.",
-                "gpt-4o-mini-tts", 600, SPEAKER_VOICE_MAPPING_PRESETS[2]
-            ],
-            [
-                "A single line of narration, no speaker tag initially. This will be auto-assigned to 'Narrator'.",
-                "tts-1", 0, SPEAKER_VOICE_MAPPING_PRESETS[0]  # tts-1 to show it works too
-            ]
+            [example_script_1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_1, 1.0, "Speak naturally."],
+            [example_script_2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_2, 1.1, ""],
+            ["[Narrator] A single line, using global settings.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]
         ],
-        inputs=[script_input, tts_model_dropdown, pause_input, speaker_voice_preset_dropdown],
-        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],  # Outputs for example click
-        fn=handle_script_processing,  # Function to run for examples
-        cache_examples=False,  # Set to True if inputs/outputs are static and large
+        inputs=[
+            script_input, tts_model_dropdown, pause_input,
+            speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
+            global_speed_input, global_instructions_input
+        ],
+        outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
+        fn=handle_script_processing,
+        cache_examples=False,
     )
 
 if __name__ == "__main__":
-    if os.name == 'nt':  # For Windows compatibility with asyncio if needed
+    if os.name == 'nt':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
  demo.launch()
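The per-line settings above resolve with a simple priority rule: a per-speaker value from the JSON config wins over the global UI value, which in turn wins over the API defaults. A self-contained sketch of that rule (the variable names here are illustrative, not the app's):

```python
# Priority rule from handle_script_processing, reduced to its core:
# per-speaker JSON value > global UI value > API default (speed 1.0, no instructions).
speaker_cfg = {"voice": "nova", "speed": None, "instructions": "sound curious"}
global_speed = 1.25          # from the Global Speech Speed slider
global_instructions = ""     # empty textbox means "no global instructions"

effective_speed = speaker_cfg["speed"] if speaker_cfg.get("speed") is not None else global_speed
effective_instructions = (
    speaker_cfg["instructions"]
    if speaker_cfg.get("instructions") and speaker_cfg["instructions"].strip()
    else (global_instructions.strip() or None)
)
assert effective_speed == 1.25                    # no per-speaker speed, so global wins
assert effective_instructions == "sound curious"  # per-speaker instructions win
```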
utils/openai_tts.py CHANGED
@@ -4,8 +4,8 @@ import time
 from openai import AsyncOpenAI, OpenAIError, RateLimitError
 import httpx  # For NSFW check
 
-# Available voices for OpenAI TTS
-OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
+# Expanded list of voices based on recent OpenAI documentation
+OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer', 'ash', 'ballad', 'coral', 'sage', 'verse']
 
 # Concurrency limiter
 MAX_CONCURRENT_REQUESTS = 2
@@ -21,33 +21,27 @@ async def is_content_safe(text: str, api_url_template: str | None) -> bool:
     Returns True if safe or if API URL is not provided, False if unsafe.
     """
     if not api_url_template:
-        return True  # Assume safe if no NSFW checker is configured
+        return True
 
-    # Basic template check - expecting {text} placeholder
     if "{text}" not in api_url_template:
         print("Warning: NSFW_API_URL_TEMPLATE does not contain {text} placeholder. Skipping NSFW check.")
         return True
 
     try:
-        encoded_text = httpx.utils.quote(text)  # URL encode the text
+        encoded_text = httpx.utils.quote(text)
         url = api_url_template.format(text=encoded_text)
 
         async with httpx.AsyncClient() as client:
             response = await client.get(url, timeout=10.0)
 
-        # Assuming 200 OK means "safe" and other statuses might mean "unsafe" or error
-        # This logic might need adjustment based on the specific API's response codes
         if response.status_code == 200:
-            # Further, check response content if API specifies (e.g., JSON payload)
-            # For a generic template, we'll assume 200 means safe.
-            # Example: response_json = response.json(); return response_json.get("is_safe", False)
             return True
         else:
             print(f"NSFW Check: API request failed or content flagged. Status: {response.status_code}, Response: {response.text[:200]}")
             return False
     except httpx.RequestError as e:
         print(f"NSFW Check: API request error: {e}")
-        return False  # Fail safe (consider content unsafe on API error)
+        return False
     except Exception as e:
         print(f"NSFW Check: An unexpected error occurred: {e}")
         return False
@@ -59,36 +53,49 @@
     voice: str,
     output_path: str,
     model: str = "tts-1-hd",
+    speed: float = 1.0,
+    instructions: str | None = None,
     nsfw_api_url_template: str | None = None,
-    line_index: int = -1  # For logging purposes
+    line_index: int = -1
 ) -> str | None:
     """
     Synthesizes a single line of text to speech using OpenAI TTS.
+    Includes speed and instructions parameters based on model compatibility.
     Retries on RateLimitError with exponential backoff.
     Returns the output_path if successful, None otherwise.
     """
     if nsfw_api_url_template:
         if not await is_content_safe(text, nsfw_api_url_template):
             print(f"Line {line_index if line_index != -1 else 'N/A'}: Content flagged as NSFW. Skipping synthesis.")
-            # Create a silent MP3 or handle as needed; here we skip and return None
-            # This could also raise an exception to halt processing.
             return None
 
-
     current_retry = 0
     backoff_seconds = INITIAL_BACKOFF_SECONDS
 
-    async with semaphore:  # Acquire semaphore before making API call
+    async with semaphore:
         while current_retry < MAX_RETRIES:
             try:
-                response = await client.audio.speech.create(
-                    model=model,
-                    voice=voice,
-                    input=text,
-                    response_format="mp3"
-                )
+                request_params = {
+                    "model": model,
+                    "voice": voice,
+                    "input": text,
+                    "response_format": "mp3"
+                }
+
+                # Add speed if model supports it and speed is not default
+                if model in ["tts-1", "tts-1-hd"]:
+                    if speed is not None and speed != 1.0:  # OpenAI default is 1.0
+                        # Ensure speed is within valid range for safety, though UI should also constrain this
+                        clamped_speed = max(0.25, min(speed, 4.0))
+                        request_params["speed"] = clamped_speed
+
+                # Add instructions if model supports it and instructions are provided.
+                # Assuming gpt-4o-mini-tts supports it, and tts-1/tts-1-hd do not.
+                if model not in ["tts-1", "tts-1-hd"] and instructions:  # Example: gpt-4o-mini-tts
+                    request_params["instructions"] = instructions
+
+                response = await client.audio.speech.create(**request_params)
                 await response.astream_to_file(output_path)
-                # print(f"Successfully synthesized line {line_index if line_index != -1 else ''} to {output_path} using voice {voice}")
                 return output_path
             except RateLimitError as e:
                 current_retry += 1
@@ -97,19 +104,16 @@ async def synthesize_speech_line(
                     return None
                 print(f"Line {line_index if line_index != -1 else ''}: Rate limit hit. Retrying in {backoff_seconds}s... (Attempt {current_retry}/{MAX_RETRIES})")
                 await asyncio.sleep(backoff_seconds)
-                backoff_seconds *= 2  # Exponential backoff
+                backoff_seconds *= 2
             except OpenAIError as e:
                 print(f"Line {line_index if line_index != -1 else ''}: OpenAI API error: {e}")
-                return None  # Other OpenAI errors
+                return None
             except Exception as e:
                 print(f"Line {line_index if line_index != -1 else ''}: An unexpected error occurred during synthesis: {e}")
                 return None
-    return None  # Should not be reached if semaphore logic is correct, but as a fallback.
-
+    return None
 
 if __name__ == '__main__':
-    # This is a basic test and requires OPENAI_API_KEY to be set in environment
-    # and a temporary directory to exist or be created.
     async def main_test():
         api_key = os.getenv("OPENAI_API_KEY")
         if not api_key:
@@ -119,22 +123,37 @@ if __name__ == '__main__':
         client = AsyncOpenAI(api_key=api_key)
 
         test_lines = [
-            {"id": 0, "speaker": "Alice", "text": "Hello, this is a test line for Alice."},
-            {"id": 1, "speaker": "Bob", "text": "And this is Bob, testing his voice."},
-            {"id": 2, "speaker": "Alice", "text": "A short reply."},
-            {"id": 3, "speaker": "Charlie", "text": "Charlie here, with a much longer line of dialogue to see how the concurrency and potential rate limits are handled by the system when multiple requests are fired off."}
+            {"id": 0, "speaker": "Alice", "text": "Hello, this is a test line for Alice, spoken quickly."},
+            {"id": 1, "speaker": "Bob", "text": "And this is Bob, testing his voice with instructions.", "instructions": "Speak in a deep, resonant voice."},
+            {"id": 2, "speaker": "Alice", "text": "A short reply, spoken slowly.", "speed": 0.8},
+            {"id": 3, "speaker": "Charlie", "text": "Charlie here, normal speed."}
         ]
 
-        temp_dir = "test_audio_output"
+        temp_dir = "test_audio_output_enhanced"
         os.makedirs(temp_dir, exist_ok=True)
 
         tasks = []
         for i, line_data in enumerate(test_lines):
-            voice = OPENAI_VOICES[i % len(OPENAI_VOICES)]  # Cycle through voices
-            output_file = os.path.join(temp_dir, f"line_{line_data['id']}.mp3")
-            # Set model to tts-1 for faster testing if desired, default is tts-1-hd
+            # Test with specific models to check param compatibility.
+            # For Alice (speed): tts-1-hd. For Bob (instructions): gpt-4o-mini-tts.
+            current_model = "tts-1-hd"
+            if "instructions" in line_data:
+                current_model = "gpt-4o-mini-tts"  # Example, ensure this model is available for your key
+
+            voice = OPENAI_VOICES[i % len(OPENAI_VOICES)]
+            output_file = os.path.join(temp_dir, f"line_{line_data['id']}_{current_model}.mp3")
+
             tasks.append(
-                synthesize_speech_line(client, line_data["text"], voice, output_file, model="tts-1", line_index=line_data['id'])
+                synthesize_speech_line(
+                    client,
+                    line_data["text"],
+                    voice,
+                    output_file,
+                    model=current_model,
+                    speed=line_data.get("speed", 1.0),
+                    instructions=line_data.get("instructions"),
+                    line_index=line_data['id']
+                )
             )
 
         results = await asyncio.gather(*tasks)
@@ -143,14 +162,7 @@ if __name__ == '__main__':
         print(f"\nSuccessfully synthesized {len(successful_files)} out of {len(test_lines)} lines.")
         for f_path in successful_files:
             print(f" - {f_path}")
-
-        # Clean up test files (optional)
-        # for f_path in successful_files:
-        #     os.remove(f_path)
-        # if not os.listdir(temp_dir):  # only remove if empty
-        #     os.rmdir(temp_dir)
-
 
-    if os.name == 'nt':  # For Windows compatibility with asyncio if needed
+    if os.name == 'nt':
         asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
  asyncio.run(main_test())
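For a quick local check of the new parameters, a minimal direct call (a sketch: it assumes OPENAI_API_KEY is set, that the module-level semaphore and retry constants exist as in the diff above, and that gpt-4o-mini-tts is available to the key if you pass instructions):

```python
import asyncio
import os

from openai import AsyncOpenAI
from utils.openai_tts import synthesize_speech_line

async def demo():
    client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
    # speed is only forwarded for tts-1/tts-1-hd; instructions only for other
    # models such as gpt-4o-mini-tts (see the compatibility checks above).
    path = await synthesize_speech_line(
        client, "Hello there!", "alloy", "hello.mp3",
        model="tts-1-hd", speed=1.2, line_index=0,
    )
    print(path or "synthesis failed or was skipped")

asyncio.run(demo())
```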