Spaces:
Running
Running
speed, instructions, granular voice change, tts-1 avainlable
Browse files- app.py +290 -151
- utils/openai_tts.py +58 -46
app.py
CHANGED
@@ -5,20 +5,20 @@ import tempfile
|
|
5 |
import shutil
|
6 |
import zipfile
|
7 |
import random
|
|
|
8 |
from openai import AsyncOpenAI
|
9 |
|
10 |
from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
|
11 |
-
from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES
|
12 |
from utils.merge_audio import merge_mp3_files
|
13 |
|
14 |
# --- Configuration ---
|
15 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
16 |
-
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
|
17 |
-
MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
|
18 |
|
19 |
# Ensure API key is available
|
20 |
if not OPENAI_API_KEY:
|
21 |
-
# Try to read from Hugging Face secrets if running in a Space
|
22 |
try:
|
23 |
from huggingface_hub import HfApi
|
24 |
api = HfApi()
|
@@ -30,82 +30,170 @@ if not OPENAI_API_KEY:
|
|
30 |
MODEL_DEFAULT = secrets.get("MODEL_DEFAULT", MODEL_DEFAULT)
|
31 |
except Exception as e:
|
32 |
print(f"Could not retrieve secrets from Hugging Face Hub: {e}")
|
33 |
-
# Potentially raise an error or disable functionality if key is essential
|
34 |
-
# For now, we'll let it proceed, and OpenAI client init will fail later if key is still None.
|
35 |
|
36 |
-
# Initialize OpenAI client if key is found
|
37 |
async_openai_client = None
|
38 |
if OPENAI_API_KEY:
|
39 |
async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
|
40 |
else:
|
41 |
-
|
42 |
-
print("ERROR: OPENAI_API_KEY secret is not set. The application will not function.")
|
43 |
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
SPEAKER_VOICE_MAPPING_PRESETS = ["Single voice (alloy)", "Random per Speaker", "A/B (alloy, echo, ...)"]
|
46 |
-
TTS_MODELS = ["tts-1-hd", "gpt-4o-mini-tts"]
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
async def handle_script_processing(
|
80 |
dialogue_script: str,
|
81 |
tts_model: str,
|
82 |
pause_ms: int,
|
83 |
-
|
|
|
|
|
|
|
|
|
84 |
progress=gr.Progress(track_tqdm=True)
|
85 |
):
|
86 |
-
global
|
87 |
-
|
88 |
|
89 |
if not OPENAI_API_KEY or not async_openai_client:
|
90 |
return None, None, "Error: OPENAI_API_KEY is not configured. Cannot proceed."
|
91 |
-
|
92 |
if not dialogue_script.strip():
|
93 |
return None, None, "Error: Script is empty."
|
94 |
|
95 |
-
# Ensure /tmp exists (it should on HF Spaces)
|
96 |
-
os.makedirs("/tmp", exist_ok=True)
|
97 |
-
|
98 |
-
# Create a unique temporary directory for this job's files
|
99 |
-
# This helps in cleaning up and avoiding conflicts if /tmp is shared/persistent
|
100 |
-
# temp_job_dir = tempfile.mkdtemp(dir="/tmp", prefix="dialogue_tts_")
|
101 |
-
# Using a fixed sub-directory in /tmp for simplicity for now, and clearing it.
|
102 |
-
# For more robust multi-user on same instance (not free tier concern), mkdtemp is better.
|
103 |
-
|
104 |
-
# Path for this job's audio files within /tmp
|
105 |
-
# Using a more specific path for cleanup
|
106 |
job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
|
107 |
-
|
108 |
-
# Clear previous job's files from the specific prefix directory
|
109 |
if os.path.exists(job_audio_path_prefix):
|
110 |
shutil.rmtree(job_audio_path_prefix)
|
111 |
os.makedirs(job_audio_path_prefix, exist_ok=True)
|
@@ -113,34 +201,57 @@ async def handle_script_processing(
|
|
113 |
try:
|
114 |
parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
|
115 |
if not parsed_lines:
|
116 |
-
|
117 |
-
|
118 |
-
except ValueError as e: # Handles MAX_SCRIPT_LENGTH
|
119 |
-
# shutil.rmtree(job_audio_path_prefix) # Clean up
|
120 |
return None, None, f"Script parsing error: {str(e)}"
|
121 |
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
# The actual voice assignment per line happens inside the task creation loop
|
128 |
-
# To ensure stability of voice per speaker if "Random" is chosen.
|
129 |
-
_ = [get_voice_for_speaker(p["speaker"], speaker_voice_preset, parsed_lines) for p in parsed_lines]
|
130 |
|
131 |
for i, line_data in enumerate(parsed_lines):
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
tasks.append(
|
138 |
synthesize_speech_line(
|
139 |
client=async_openai_client,
|
140 |
text=line_data["text"],
|
141 |
-
voice=
|
142 |
output_path=output_filename,
|
143 |
model=tts_model,
|
|
|
|
|
144 |
nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
|
145 |
line_index=line_data['id']
|
146 |
)
|
@@ -148,60 +259,29 @@ async def handle_script_processing(
|
|
148 |
|
149 |
synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
|
150 |
|
151 |
-
# Place results into line_audio_files based on original line_data['id'] if possible,
|
152 |
-
# or simply by order of completion if IDs are not perfectly mapping (should not happen with current setup).
|
153 |
-
# For now, assuming `synthesis_results` order matches `parsed_lines` due to `asyncio.gather` preserving order.
|
154 |
for idx, result in enumerate(synthesis_results):
|
155 |
if isinstance(result, Exception):
|
156 |
-
# Log the error, the file will remain None
|
157 |
print(f"Error during synthesis for line {parsed_lines[idx]['id']}: {result}")
|
158 |
-
|
159 |
-
elif result is None: # Synthesis skipped (e.g. NSFW) or failed internally
|
160 |
print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
|
161 |
else:
|
162 |
-
# Store the path of the successfully synthesized file
|
163 |
-
# We need to map 'idx' from gather result back to original 'id' if they differ
|
164 |
-
# For now, assume parsed_lines[idx]['id'] is the relevant one.
|
165 |
-
# line_audio_files is already indexed by 'idx' which corresponds to parsed_lines order.
|
166 |
line_audio_files[idx] = result
|
167 |
|
168 |
valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
169 |
|
170 |
if not valid_audio_files:
|
171 |
-
shutil.rmtree(job_audio_path_prefix)
|
172 |
-
return None, None, "Error: No audio files were successfully synthesized.
|
173 |
|
174 |
-
# 1. Create ZIP of individual lines
|
175 |
zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
176 |
with zipfile.ZipFile(zip_filename, 'w') as zf:
|
177 |
for audio_file_path in valid_audio_files:
|
178 |
zf.write(audio_file_path, os.path.basename(audio_file_path))
|
179 |
|
180 |
-
# 2. Create merged MP3
|
181 |
-
# We need to pass the *ordered* list of successfully generated files for merging
|
182 |
-
# `line_audio_files` contains paths or None, in the original script order.
|
183 |
ordered_valid_files_for_merge = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
184 |
-
|
185 |
merged_mp3_filename = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
|
186 |
merged_output_path = merge_mp3_files(ordered_valid_files_for_merge, merged_mp3_filename, pause_ms)
|
187 |
|
188 |
-
if not merged_output_path:
|
189 |
-
# Merging failed, but we might still have the zip.
|
190 |
-
# Return zip, and None for merged, with an error message.
|
191 |
-
# For simplicity now, let's consider this a partial success if zip is there.
|
192 |
-
# Or, make it an error:
|
193 |
-
# shutil.rmtree(job_audio_path_prefix)
|
194 |
-
# return None, None, "Error: Failed to merge audio files, though individual lines might be available."
|
195 |
-
# Let's return what we have
|
196 |
-
print("Warning: Merged MP3 generation failed. Individual files might still be in ZIP.")
|
197 |
-
# We still return the zip path, and None for merged path
|
198 |
-
|
199 |
-
# Gradio output: individual_lines_zip, merged_dialogue_mp3, status_message
|
200 |
-
# If merged_output_path is None, Gradio will show nothing for that component if its type expects a file.
|
201 |
-
# It's better to return a string path, even if the file doesn't exist, and let Gradio handle it,
|
202 |
-
# or handle it by returning None and ensure the component can take None.
|
203 |
-
# For File output, returning None is fine.
|
204 |
-
|
205 |
final_status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
|
206 |
if len(valid_audio_files) < len(parsed_lines):
|
207 |
final_status += "Some lines failed or were skipped. "
|
@@ -212,20 +292,6 @@ async def handle_script_processing(
|
|
212 |
else:
|
213 |
final_status += "Outputs generated."
|
214 |
|
215 |
-
# Make copies of the files to a location Gradio can serve if they are in the job_audio_path_prefix
|
216 |
-
# which might be cleaned up. Gradio makes its own copies for File output components.
|
217 |
-
# So, returning paths from job_audio_path_prefix is fine.
|
218 |
-
|
219 |
-
# Return paths for Gradio File components
|
220 |
-
# Gradio will make these downloadable.
|
221 |
-
# If a file doesn't exist or is None, the Gradio component should handle it gracefully (e.g., show nothing).
|
222 |
-
|
223 |
-
# Intentionally DO NOT clean up job_audio_path_prefix here.
|
224 |
-
# Gradio needs access to these files to serve them.
|
225 |
-
# Cleanup should happen at the start of the *next* run, or via a different mechanism
|
226 |
-
# if HF Spaces doesn't clean /tmp periodically.
|
227 |
-
# The current strategy of clearing job_audio_path_prefix at the start of handle_script_processing is good.
|
228 |
-
|
229 |
return zip_filename if os.path.exists(zip_filename) else None, \
|
230 |
merged_output_path if merged_output_path and os.path.exists(merged_output_path) else None, \
|
231 |
final_status
|
@@ -240,44 +306,72 @@ def handle_calculate_cost(dialogue_script: str, tts_model: str):
|
|
240 |
return "Cost: $0.000000 (No valid lines)"
|
241 |
cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
|
242 |
return f"Estimated OpenAI Cost: ${cost:.6f} (for {total_chars} characters in {len(parsed_lines)} lines)"
|
243 |
-
except ValueError as e:
|
244 |
return f"Error: {str(e)}"
|
245 |
except Exception as e:
|
246 |
return f"Error calculating cost: {str(e)}"
|
247 |
|
248 |
# --- Gradio Interface ---
|
249 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
250 |
-
gr.Markdown("# Dialogue Script to Speech Converter")
|
251 |
gr.Markdown(
|
252 |
-
"Convert
|
253 |
-
"Supports `tts-1-hd` and `gpt-4o-mini-tts` from OpenAI. "
|
254 |
-
"Uses a maximum of 2 concurrent TTS requests."
|
255 |
)
|
256 |
if not OPENAI_API_KEY or not async_openai_client:
|
257 |
-
gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid.
|
258 |
-
|
259 |
|
260 |
with gr.Row():
|
261 |
with gr.Column(scale=2):
|
262 |
script_input = gr.TextArea(
|
263 |
label="Dialogue Script",
|
264 |
-
placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!
|
265 |
lines=10,
|
266 |
info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
|
267 |
)
|
268 |
with gr.Column(scale=1):
|
269 |
tts_model_dropdown = gr.Dropdown(
|
270 |
-
|
271 |
-
|
272 |
-
speaker_voice_preset_dropdown = gr.Dropdown(
|
273 |
-
SPEAKER_VOICE_MAPPING_PRESETS, label="Speaker Voice Assignment", value=SPEAKER_VOICE_MAPPING_PRESETS[0],
|
274 |
-
info="How voices are assigned to speakers."
|
275 |
)
|
276 |
pause_input = gr.Number(
|
277 |
label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
|
278 |
info="Silence duration in milliseconds between merged lines."
|
279 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
with gr.Row():
|
282 |
calculate_cost_button = gr.Button("Calculate Estimated Cost")
|
283 |
generate_button = gr.Button("Generate Audio Files", variant="primary")
|
@@ -288,8 +382,38 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
288 |
individual_lines_zip_output = gr.File(label="Download ZIP of Individual Lines")
|
289 |
merged_dialogue_mp3_output = gr.Audio(label="Merged Dialogue MP3", type="filepath")
|
290 |
|
291 |
-
status_output = gr.Textbox(label="Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
calculate_cost_button.click(
|
294 |
fn=handle_calculate_cost,
|
295 |
inputs=[script_input, tts_model_dropdown],
|
@@ -298,34 +422,49 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
298 |
|
299 |
generate_button.click(
|
300 |
fn=handle_script_processing,
|
301 |
-
inputs=[
|
|
|
|
|
|
|
|
|
302 |
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
303 |
-
# api_name="generate_audio" # if you want to expose an API endpoint
|
304 |
)
|
305 |
|
306 |
gr.Markdown("## Examples")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
gr.Examples(
|
308 |
examples=[
|
309 |
-
[
|
310 |
-
|
311 |
-
|
312 |
-
],
|
313 |
-
[
|
314 |
-
"[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain.\n[Captain Eva] Excellent. Maintain course for Kepler-186f.",
|
315 |
-
"gpt-4o-mini-tts", 600, SPEAKER_VOICE_MAPPING_PRESETS[2]
|
316 |
-
],
|
317 |
-
[
|
318 |
-
"A single line of narration, no speaker tag initially. This will be auto-assigned to 'Narrator'.",
|
319 |
-
"tts-1", 0, SPEAKER_VOICE_MAPPING_PRESETS[0] # tts-1 to show it works too
|
320 |
-
]
|
321 |
],
|
322 |
-
inputs=[
|
323 |
-
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
|
|
326 |
)
|
327 |
|
328 |
if __name__ == "__main__":
|
329 |
-
if os.name == 'nt':
|
330 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
331 |
demo.launch()
|
|
|
5 |
import shutil
|
6 |
import zipfile
|
7 |
import random
|
8 |
+
import json
|
9 |
from openai import AsyncOpenAI
|
10 |
|
11 |
from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
|
12 |
+
from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES # Renamed for clarity
|
13 |
from utils.merge_audio import merge_mp3_files
|
14 |
|
15 |
# --- Configuration ---
|
16 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
17 |
+
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
|
18 |
+
MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
|
19 |
|
20 |
# Ensure API key is available
|
21 |
if not OPENAI_API_KEY:
|
|
|
22 |
try:
|
23 |
from huggingface_hub import HfApi
|
24 |
api = HfApi()
|
|
|
30 |
MODEL_DEFAULT = secrets.get("MODEL_DEFAULT", MODEL_DEFAULT)
|
31 |
except Exception as e:
|
32 |
print(f"Could not retrieve secrets from Hugging Face Hub: {e}")
|
|
|
|
|
33 |
|
|
|
34 |
async_openai_client = None
|
35 |
if OPENAI_API_KEY:
|
36 |
async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
|
37 |
else:
|
38 |
+
print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
|
|
|
39 |
|
40 |
+
# TTS Models available in the UI
|
41 |
+
TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
|
42 |
+
if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE: # Ensure default is valid
|
43 |
+
MODEL_DEFAULT = "tts-1-hd"
|
44 |
|
|
|
|
|
45 |
|
46 |
+
SPEAKER_CONFIG_METHODS = [
|
47 |
+
"Single Voice (Global)",
|
48 |
+
"Random per Speaker",
|
49 |
+
"A/B Round Robin",
|
50 |
+
"Detailed Configuration (JSON)"
|
51 |
+
]
|
52 |
+
DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
|
53 |
|
54 |
+
# For UI elements that need the list of voices
|
55 |
+
APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
|
|
|
|
|
56 |
|
57 |
+
|
58 |
+
# Global store for parsed speaker configurations
|
59 |
+
_speaker_config_cache = {}
|
60 |
+
|
61 |
+
def parse_detailed_speaker_config(json_text, parsed_script_lines):
|
62 |
+
""" Parses the JSON config string into a speaker map. """
|
63 |
+
config_map = {}
|
64 |
+
default_voice = APP_AVAILABLE_VOICES[0]
|
65 |
+
try:
|
66 |
+
if not json_text.strip():
|
67 |
+
return {} # Empty config means rely on global or other fallbacks
|
68 |
+
|
69 |
+
config_list = json.loads(json_text)
|
70 |
+
if not isinstance(config_list, list):
|
71 |
+
raise ValueError("JSON config must be a list of speaker objects.")
|
72 |
+
|
73 |
+
for item in config_list:
|
74 |
+
if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
|
75 |
+
# Log warning or skip malformed item
|
76 |
+
print(f"Skipping malformed item in JSON config: {item}")
|
77 |
+
continue
|
78 |
+
if item["voice"] not in APP_AVAILABLE_VOICES:
|
79 |
+
print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
|
80 |
+
item["voice"] = default_voice
|
81 |
+
|
82 |
+
# Validate speed if present
|
83 |
+
if "speed" in item:
|
84 |
+
try:
|
85 |
+
item["speed"] = float(item["speed"])
|
86 |
+
if not (0.25 <= item["speed"] <= 4.0):
|
87 |
+
print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping or defaulting.")
|
88 |
+
item["speed"] = max(0.25, min(item["speed"], 4.0)) # Clamp
|
89 |
+
except ValueError:
|
90 |
+
print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
|
91 |
+
# Let it be None or remove, so global/default speed applies
|
92 |
+
item.pop("speed", None)
|
93 |
+
|
94 |
+
|
95 |
+
config_map[item["speaker"]] = {
|
96 |
+
"voice": item["voice"],
|
97 |
+
"speed": item.get("speed"), # Will be None if not present or invalid
|
98 |
+
"instructions": item.get("instructions") # Will be None if not present
|
99 |
+
}
|
100 |
+
return config_map
|
101 |
+
except json.JSONDecodeError as e:
|
102 |
+
raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
|
103 |
+
except ValueError as e: # Catch our own ValueErrors
|
104 |
+
raise e # Re-raise
|
105 |
+
except Exception as e: # Catch any other unexpected errors during parsing
|
106 |
+
raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
|
107 |
+
|
108 |
+
|
109 |
+
def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
|
110 |
+
global_selected_voice, detailed_config_map):
|
111 |
+
global _speaker_config_cache # This cache helps maintain consistency for "Random" and "A/B" within a run
|
112 |
+
|
113 |
+
# If method changed or cache is for a different set of speakers, reset it.
|
114 |
+
# A more robust cache key might involve hashing unique_script_speakers.
|
115 |
+
if _speaker_config_cache.get("__method") != speaker_config_method or \
|
116 |
+
_speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
|
117 |
+
_speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
|
118 |
+
|
119 |
+
base_config = {"voice": APP_AVAILABLE_VOICES[0], "speed": None, "instructions": None}
|
120 |
+
|
121 |
+
if speaker_config_method == "Single Voice (Global)":
|
122 |
+
base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
|
123 |
+
return base_config
|
124 |
|
125 |
+
if speaker_config_method == "Detailed Configuration (JSON)":
|
126 |
+
if speaker_name in detailed_config_map:
|
127 |
+
# Merge with base_config to ensure all keys are present if some are optional in JSON
|
128 |
+
# JSON values take precedence
|
129 |
+
speaker_specific = detailed_config_map[speaker_name]
|
130 |
+
return {
|
131 |
+
"voice": speaker_specific.get("voice", base_config["voice"]),
|
132 |
+
"speed": speaker_specific.get("speed"), # Allow None to use global
|
133 |
+
"instructions": speaker_specific.get("instructions") # Allow None to use global
|
134 |
+
}
|
135 |
+
else: # Fallback for speakers in script but not in JSON map
|
136 |
+
# Could use a default voice, or a cycling voice for unmapped speakers
|
137 |
+
# For now, let's use the first voice as a simple fallback.
|
138 |
+
print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
|
139 |
+
return base_config
|
140 |
+
|
141 |
+
|
142 |
+
# For "Random" and "A/B", we only map voices. Speed/Instructions will be global.
|
143 |
+
if speaker_name not in _speaker_config_cache:
|
144 |
+
if speaker_config_method == "Random per Speaker":
|
145 |
+
# Assign a random voice if not already cached for this run
|
146 |
+
available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
|
147 |
+
# Ensure all unique speakers get an assignment before reusing voices from cache build
|
148 |
+
if not _speaker_config_cache.get("__all_assigned_random"):
|
149 |
+
for i, spk_unique in enumerate(unique_script_speakers):
|
150 |
+
if spk_unique not in _speaker_config_cache:
|
151 |
+
_speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
|
152 |
+
_speaker_config_cache["__all_assigned_random"] = True
|
153 |
+
|
154 |
+
# If somehow still not found (should not happen if pre-populated)
|
155 |
+
if speaker_name not in _speaker_config_cache:
|
156 |
+
_speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
|
157 |
+
|
158 |
+
elif speaker_config_method == "A/B Round Robin":
|
159 |
+
# Assign voices in order if not already cached
|
160 |
+
if not _speaker_config_cache.get("__all_assigned_ab"):
|
161 |
+
for i, spk_unique in enumerate(unique_script_speakers):
|
162 |
+
if spk_unique not in _speaker_config_cache:
|
163 |
+
_speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
|
164 |
+
_speaker_config_cache["__all_assigned_ab"] = True
|
165 |
+
|
166 |
+
if speaker_name not in _speaker_config_cache: # Fallback, should be populated
|
167 |
+
speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
|
168 |
+
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
|
169 |
+
else: # Should not happen
|
170 |
+
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
|
171 |
+
|
172 |
+
# Return only voice for Random/AB, speed/instructions will be from global UI inputs
|
173 |
+
cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
|
174 |
+
return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
|
175 |
+
|
176 |
|
177 |
async def handle_script_processing(
|
178 |
dialogue_script: str,
|
179 |
tts_model: str,
|
180 |
pause_ms: int,
|
181 |
+
speaker_config_method: str,
|
182 |
+
global_voice_selection: str, # From dropdown if "Single Voice (Global)"
|
183 |
+
detailed_speaker_json: str, # From JSON input
|
184 |
+
global_speed: float,
|
185 |
+
global_instructions: str,
|
186 |
progress=gr.Progress(track_tqdm=True)
|
187 |
):
|
188 |
+
global _speaker_config_cache
|
189 |
+
_speaker_config_cache = {} # Reset speaker config cache for each new run
|
190 |
|
191 |
if not OPENAI_API_KEY or not async_openai_client:
|
192 |
return None, None, "Error: OPENAI_API_KEY is not configured. Cannot proceed."
|
|
|
193 |
if not dialogue_script.strip():
|
194 |
return None, None, "Error: Script is empty."
|
195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
|
|
|
|
|
197 |
if os.path.exists(job_audio_path_prefix):
|
198 |
shutil.rmtree(job_audio_path_prefix)
|
199 |
os.makedirs(job_audio_path_prefix, exist_ok=True)
|
|
|
201 |
try:
|
202 |
parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
|
203 |
if not parsed_lines:
|
204 |
+
return None, None, "Error: No valid dialogue lines found."
|
205 |
+
except ValueError as e:
|
|
|
|
|
206 |
return None, None, f"Script parsing error: {str(e)}"
|
207 |
|
208 |
+
unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
|
209 |
+
parsed_detailed_config_map = {}
|
210 |
+
if speaker_config_method == "Detailed Configuration (JSON)":
|
211 |
+
try:
|
212 |
+
parsed_detailed_config_map = parse_detailed_speaker_config(detailed_speaker_json, parsed_lines)
|
213 |
+
except ValueError as e:
|
214 |
+
return None, None, f"Configuration Error: {str(e)}"
|
215 |
|
216 |
+
tasks = []
|
217 |
+
line_audio_files = [None] * len(parsed_lines)
|
|
|
|
|
|
|
218 |
|
219 |
for i, line_data in enumerate(parsed_lines):
|
220 |
+
speaker_name = line_data["speaker"]
|
221 |
+
|
222 |
+
# Get base config (primarily voice) based on method
|
223 |
+
# For "Random" and "A/B", this will just return voice. Speed/instructions are taken from global.
|
224 |
+
# For "Single Voice", it returns the globally selected voice.
|
225 |
+
# For "Detailed JSON", it returns voice, and potentially per-speaker speed/instructions.
|
226 |
+
speaker_base_cfg = get_config_for_speaker(
|
227 |
+
speaker_name, speaker_config_method, unique_speakers,
|
228 |
+
global_voice_selection, parsed_detailed_config_map
|
229 |
+
)
|
230 |
+
|
231 |
+
line_voice = speaker_base_cfg["voice"]
|
232 |
|
233 |
+
# Determine effective speed and instructions
|
234 |
+
# Priority: Per-speaker from JSON > Global UI > API Default (1.0 for speed, None for instructions)
|
235 |
+
effective_speed = global_speed # Start with global
|
236 |
+
if speaker_base_cfg.get("speed") is not None: # If JSON provided a speed for this speaker
|
237 |
+
effective_speed = speaker_base_cfg["speed"]
|
238 |
+
|
239 |
+
effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
|
240 |
+
if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip(): # If JSON provided instructions
|
241 |
+
effective_instructions = speaker_base_cfg["instructions"]
|
242 |
+
|
243 |
+
output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
|
244 |
+
progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
|
245 |
|
246 |
tasks.append(
|
247 |
synthesize_speech_line(
|
248 |
client=async_openai_client,
|
249 |
text=line_data["text"],
|
250 |
+
voice=line_voice,
|
251 |
output_path=output_filename,
|
252 |
model=tts_model,
|
253 |
+
speed=effective_speed,
|
254 |
+
instructions=effective_instructions,
|
255 |
nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
|
256 |
line_index=line_data['id']
|
257 |
)
|
|
|
259 |
|
260 |
synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
|
261 |
|
|
|
|
|
|
|
262 |
for idx, result in enumerate(synthesis_results):
|
263 |
if isinstance(result, Exception):
|
|
|
264 |
print(f"Error during synthesis for line {parsed_lines[idx]['id']}: {result}")
|
265 |
+
elif result is None:
|
|
|
266 |
print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
|
267 |
else:
|
|
|
|
|
|
|
|
|
268 |
line_audio_files[idx] = result
|
269 |
|
270 |
valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
271 |
|
272 |
if not valid_audio_files:
|
273 |
+
shutil.rmtree(job_audio_path_prefix)
|
274 |
+
return None, None, "Error: No audio files were successfully synthesized."
|
275 |
|
|
|
276 |
zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
277 |
with zipfile.ZipFile(zip_filename, 'w') as zf:
|
278 |
for audio_file_path in valid_audio_files:
|
279 |
zf.write(audio_file_path, os.path.basename(audio_file_path))
|
280 |
|
|
|
|
|
|
|
281 |
ordered_valid_files_for_merge = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
|
|
282 |
merged_mp3_filename = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
|
283 |
merged_output_path = merge_mp3_files(ordered_valid_files_for_merge, merged_mp3_filename, pause_ms)
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
final_status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
|
286 |
if len(valid_audio_files) < len(parsed_lines):
|
287 |
final_status += "Some lines failed or were skipped. "
|
|
|
292 |
else:
|
293 |
final_status += "Outputs generated."
|
294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
return zip_filename if os.path.exists(zip_filename) else None, \
|
296 |
merged_output_path if merged_output_path and os.path.exists(merged_output_path) else None, \
|
297 |
final_status
|
|
|
306 |
return "Cost: $0.000000 (No valid lines)"
|
307 |
cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
|
308 |
return f"Estimated OpenAI Cost: ${cost:.6f} (for {total_chars} characters in {len(parsed_lines)} lines)"
|
309 |
+
except ValueError as e:
|
310 |
return f"Error: {str(e)}"
|
311 |
except Exception as e:
|
312 |
return f"Error calculating cost: {str(e)}"
|
313 |
|
314 |
# --- Gradio Interface ---
|
315 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
316 |
+
gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
|
317 |
gr.Markdown(
|
318 |
+
"Convert dialogue scripts to speech with fine-grained control over voices, speed, and instructions."
|
|
|
|
|
319 |
)
|
320 |
if not OPENAI_API_KEY or not async_openai_client:
|
321 |
+
gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid. Speech synthesis will fail.</h3>")
|
|
|
322 |
|
323 |
with gr.Row():
|
324 |
with gr.Column(scale=2):
|
325 |
script_input = gr.TextArea(
|
326 |
label="Dialogue Script",
|
327 |
+
placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!",
|
328 |
lines=10,
|
329 |
info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
|
330 |
)
|
331 |
with gr.Column(scale=1):
|
332 |
tts_model_dropdown = gr.Dropdown(
|
333 |
+
TTS_MODELS_AVAILABLE, label="OpenAI TTS Model", value=MODEL_DEFAULT,
|
334 |
+
info="Select TTS model. Affects available controls below."
|
|
|
|
|
|
|
335 |
)
|
336 |
pause_input = gr.Number(
|
337 |
label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
|
338 |
info="Silence duration in milliseconds between merged lines."
|
339 |
)
|
340 |
+
# Model-specific global controls
|
341 |
+
global_speed_input = gr.Slider(
|
342 |
+
minimum=0.25, maximum=4.0, value=1.0, step=0.05,
|
343 |
+
label="Global Speech Speed (for tts-1/tts-1-hd)",
|
344 |
+
visible= (MODEL_DEFAULT in ["tts-1", "tts-1-hd"]),
|
345 |
+
interactive=True
|
346 |
+
)
|
347 |
+
global_instructions_input = gr.Textbox(
|
348 |
+
label="Global Voice Instructions (for gpt-4o-mini-tts)",
|
349 |
+
placeholder="e.g., Speak in a calm, reassuring tone.",
|
350 |
+
visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"),
|
351 |
+
interactive=True, lines=2
|
352 |
+
)
|
353 |
|
354 |
+
gr.Markdown("### Speaker Configuration")
|
355 |
+
with gr.Row():
|
356 |
+
speaker_config_method_dropdown = gr.Dropdown(
|
357 |
+
SPEAKER_CONFIG_METHODS, label="Speaker Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
|
358 |
+
)
|
359 |
+
global_voice_dropdown = gr.Dropdown(
|
360 |
+
APP_AVAILABLE_VOICES, label="Global Voice (for 'Single Voice' method)",
|
361 |
+
value=APP_AVAILABLE_VOICES[0],
|
362 |
+
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"),
|
363 |
+
interactive=True
|
364 |
+
)
|
365 |
+
|
366 |
+
detailed_speaker_config_input = gr.Code(
|
367 |
+
label="Detailed Speaker Configuration (JSON)",
|
368 |
+
language="json",
|
369 |
+
lines=7,
|
370 |
+
placeholder='[\n {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},\n {"speaker": "Bob", "voice": "echo"},\n {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}\n]',
|
371 |
+
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)"),
|
372 |
+
info="Define voice, and optionally model-compatible speed/instructions per speaker."
|
373 |
+
)
|
374 |
+
|
375 |
with gr.Row():
|
376 |
calculate_cost_button = gr.Button("Calculate Estimated Cost")
|
377 |
generate_button = gr.Button("Generate Audio Files", variant="primary")
|
|
|
382 |
individual_lines_zip_output = gr.File(label="Download ZIP of Individual Lines")
|
383 |
merged_dialogue_mp3_output = gr.Audio(label="Merged Dialogue MP3", type="filepath")
|
384 |
|
385 |
+
status_output = gr.Textbox(label="Status", interactive=False, lines=2)
|
386 |
+
|
387 |
+
# --- Event Handlers for UI Interactivity ---
|
388 |
+
def update_model_specific_controls_visibility(selected_model):
|
389 |
+
is_tts_1_family = selected_model in ["tts-1", "tts-1-hd"]
|
390 |
+
is_gpt_mini = selected_model == "gpt-4o-mini-tts" # And any other future models supporting instructions
|
391 |
+
return {
|
392 |
+
global_speed_input: gr.update(visible=is_tts_1_family, interactive=is_tts_1_family),
|
393 |
+
global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
|
394 |
+
}
|
395 |
+
|
396 |
+
tts_model_dropdown.change(
|
397 |
+
fn=update_model_specific_controls_visibility,
|
398 |
+
inputs=[tts_model_dropdown],
|
399 |
+
outputs=[global_speed_input, global_instructions_input]
|
400 |
+
)
|
401 |
|
402 |
+
def update_speaker_config_visibility(config_method):
    """Show only the speaker-configuration input matching the chosen method.

    'Single Voice (Global)' reveals the global voice dropdown;
    'Detailed Configuration (JSON)' reveals the per-speaker JSON editor.
    """
    use_single_voice = config_method == "Single Voice (Global)"
    use_detailed_json = config_method == "Detailed Configuration (JSON)"
    return {
        global_voice_dropdown: gr.update(visible=use_single_voice, interactive=use_single_voice),
        detailed_speaker_config_input: gr.update(visible=use_detailed_json, interactive=use_detailed_json),
    }
|
409 |
+
|
410 |
+
speaker_config_method_dropdown.change(
|
411 |
+
fn=update_speaker_config_visibility,
|
412 |
+
inputs=[speaker_config_method_dropdown],
|
413 |
+
outputs=[global_voice_dropdown, detailed_speaker_config_input]
|
414 |
+
)
|
415 |
+
|
416 |
+
# --- Main Button Actions ---
|
417 |
calculate_cost_button.click(
|
418 |
fn=handle_calculate_cost,
|
419 |
inputs=[script_input, tts_model_dropdown],
|
|
|
422 |
|
423 |
generate_button.click(
|
424 |
fn=handle_script_processing,
|
425 |
+
inputs=[
|
426 |
+
script_input, tts_model_dropdown, pause_input,
|
427 |
+
speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
|
428 |
+
global_speed_input, global_instructions_input
|
429 |
+
],
|
430 |
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
|
|
431 |
)
|
432 |
|
433 |
gr.Markdown("## Examples")
|
434 |
+
example_script_1 = "[Alice] Hi Bob, how are things?\n[Bob] Pretty good, Alice. Just working on this new project.\n[Alice] Oh, interesting! Tell me more."
|
435 |
+
example_json_1 = """
|
436 |
+
[
|
437 |
+
{"speaker": "Alice", "voice": "nova", "instructions": "sound curious"},
|
438 |
+
{"speaker": "Bob", "voice": "echo", "speed": 0.9}
|
439 |
+
]
|
440 |
+
""".strip()
|
441 |
+
|
442 |
+
example_script_2 = "[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain. I am speaking slowly.\n[Captain Eva] Excellent. Maintain course for Kepler-186f."
|
443 |
+
example_json_2 = """
|
444 |
+
[
|
445 |
+
{"speaker": "Captain Eva", "voice": "alloy", "speed": 1.0},
|
446 |
+
{"speaker": "Computer", "voice": "onyx", "speed": 0.8, "instructions": "sound robotic and calm"}
|
447 |
+
]
|
448 |
+
""".strip()
|
449 |
+
|
450 |
+
|
451 |
gr.Examples(
|
452 |
examples=[
|
453 |
+
[example_script_1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_1, 1.0, "Speak naturally."],
|
454 |
+
[example_script_2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_2, 1.1, ""],
|
455 |
+
["[Narrator] A single line, using global settings.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
456 |
],
|
457 |
+
inputs=[
|
458 |
+
script_input, tts_model_dropdown, pause_input,
|
459 |
+
speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
|
460 |
+
global_speed_input, global_instructions_input
|
461 |
+
],
|
462 |
+
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
463 |
+
fn=handle_script_processing,
|
464 |
+
cache_examples=False,
|
465 |
)
|
466 |
|
467 |
if __name__ == "__main__":
|
468 |
+
if os.name == 'nt':
|
469 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
470 |
demo.launch()
|
utils/openai_tts.py
CHANGED
@@ -4,8 +4,8 @@ import time
|
|
4 |
from openai import AsyncOpenAI, OpenAIError, RateLimitError
|
5 |
import httpx # For NSFW check
|
6 |
|
7 |
-
#
|
8 |
-
OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
|
9 |
|
10 |
# Concurrency limiter
|
11 |
MAX_CONCURRENT_REQUESTS = 2
|
@@ -21,33 +21,27 @@ async def is_content_safe(text: str, api_url_template: str | None) -> bool:
|
|
21 |
Returns True if safe or if API URL is not provided, False if unsafe.
|
22 |
"""
|
23 |
if not api_url_template:
|
24 |
-
return True
|
25 |
|
26 |
-
# Basic template check - expecting {text} placeholder
|
27 |
if "{text}" not in api_url_template:
|
28 |
print("Warning: NSFW_API_URL_TEMPLATE does not contain {text} placeholder. Skipping NSFW check.")
|
29 |
return True
|
30 |
|
31 |
try:
|
32 |
-
encoded_text = httpx.utils.quote(text)
|
33 |
url = api_url_template.format(text=encoded_text)
|
34 |
|
35 |
async with httpx.AsyncClient() as client:
|
36 |
response = await client.get(url, timeout=10.0)
|
37 |
|
38 |
-
# Assuming 200 OK means "safe" and other statuses might mean "unsafe" or error
|
39 |
-
# This logic might need adjustment based on the specific API's response codes
|
40 |
if response.status_code == 200:
|
41 |
-
# Further, check response content if API specifies (e.g., JSON payload)
|
42 |
-
# For a generic template, we'll assume 200 means safe.
|
43 |
-
# Example: response_json = response.json(); return response_json.get("is_safe", False)
|
44 |
return True
|
45 |
else:
|
46 |
print(f"NSFW Check: API request failed or content flagged. Status: {response.status_code}, Response: {response.text[:200]}")
|
47 |
return False
|
48 |
except httpx.RequestError as e:
|
49 |
print(f"NSFW Check: API request error: {e}")
|
50 |
-
return False
|
51 |
except Exception as e:
|
52 |
print(f"NSFW Check: An unexpected error occurred: {e}")
|
53 |
return False
|
@@ -59,36 +53,49 @@ async def synthesize_speech_line(
|
|
59 |
voice: str,
|
60 |
output_path: str,
|
61 |
model: str = "tts-1-hd",
|
|
|
|
|
62 |
nsfw_api_url_template: str | None = None,
|
63 |
-
line_index: int = -1
|
64 |
) -> str | None:
|
65 |
"""
|
66 |
Synthesizes a single line of text to speech using OpenAI TTS.
|
|
|
67 |
Retries on RateLimitError with exponential backoff.
|
68 |
Returns the output_path if successful, None otherwise.
|
69 |
"""
|
70 |
if nsfw_api_url_template:
|
71 |
if not await is_content_safe(text, nsfw_api_url_template):
|
72 |
print(f"Line {line_index if line_index != -1 else 'N/A'}: Content flagged as NSFW. Skipping synthesis.")
|
73 |
-
# Create a silent MP3 or handle as needed; here we skip and return None
|
74 |
-
# This could also raise an exception to halt processing.
|
75 |
return None
|
76 |
|
77 |
-
|
78 |
current_retry = 0
|
79 |
backoff_seconds = INITIAL_BACKOFF_SECONDS
|
80 |
|
81 |
-
async with semaphore:
|
82 |
while current_retry < MAX_RETRIES:
|
83 |
try:
|
84 |
-
|
85 |
-
model
|
86 |
-
voice
|
87 |
-
input
|
88 |
-
response_format
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
await response.astream_to_file(output_path)
|
91 |
-
# print(f"Successfully synthesized line {line_index if line_index !=-1 else ''} to {output_path} using voice {voice}")
|
92 |
return output_path
|
93 |
except RateLimitError as e:
|
94 |
current_retry += 1
|
@@ -97,19 +104,16 @@ async def synthesize_speech_line(
|
|
97 |
return None
|
98 |
print(f"Line {line_index if line_index != -1 else ''}: Rate limit hit. Retrying in {backoff_seconds}s... (Attempt {current_retry}/{MAX_RETRIES})")
|
99 |
await asyncio.sleep(backoff_seconds)
|
100 |
-
backoff_seconds *= 2
|
101 |
except OpenAIError as e:
|
102 |
print(f"Line {line_index if line_index != -1 else ''}: OpenAI API error: {e}")
|
103 |
-
return None
|
104 |
except Exception as e:
|
105 |
print(f"Line {line_index if line_index != -1 else ''}: An unexpected error occurred during synthesis: {e}")
|
106 |
return None
|
107 |
-
return None
|
108 |
-
|
109 |
|
110 |
if __name__ == '__main__':
|
111 |
-
# This is a basic test and requires OPENAI_API_KEY to be set in environment
|
112 |
-
# and a temporary directory to exist or be created.
|
113 |
async def main_test():
|
114 |
api_key = os.getenv("OPENAI_API_KEY")
|
115 |
if not api_key:
|
@@ -119,22 +123,37 @@ if __name__ == '__main__':
|
|
119 |
client = AsyncOpenAI(api_key=api_key)
|
120 |
|
121 |
test_lines = [
|
122 |
-
{"id": 0, "speaker": "Alice", "text": "Hello, this is a test line for Alice."},
|
123 |
-
{"id": 1, "speaker": "Bob", "text": "And this is Bob, testing his voice."},
|
124 |
-
{"id": 2, "speaker": "Alice", "text": "A short reply."},
|
125 |
-
{"id": 3, "speaker": "Charlie", "text": "Charlie here,
|
126 |
]
|
127 |
|
128 |
-
temp_dir = "
|
129 |
os.makedirs(temp_dir, exist_ok=True)
|
130 |
|
131 |
tasks = []
|
132 |
for i, line_data in enumerate(test_lines):
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
tasks.append(
|
137 |
-
synthesize_speech_line(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
)
|
139 |
|
140 |
results = await asyncio.gather(*tasks)
|
@@ -143,14 +162,7 @@ if __name__ == '__main__':
|
|
143 |
print(f"\nSuccessfully synthesized {len(successful_files)} out of {len(test_lines)} lines.")
|
144 |
for f_path in successful_files:
|
145 |
print(f" - {f_path}")
|
146 |
-
|
147 |
-
# Clean up test files (optional)
|
148 |
-
# for f_path in successful_files:
|
149 |
-
# os.remove(f_path)
|
150 |
-
# if not os.listdir(temp_dir): # only remove if empty
|
151 |
-
# os.rmdir(temp_dir)
|
152 |
-
|
153 |
|
154 |
-
if os.name == 'nt':
|
155 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
156 |
asyncio.run(main_test())
|
|
|
4 |
from openai import AsyncOpenAI, OpenAIError, RateLimitError
|
5 |
import httpx # For NSFW check
|
6 |
|
7 |
+
# Expanded list of voices based on recent OpenAI documentation
|
8 |
+
OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer', 'ash', 'ballad', 'coral', 'sage', 'verse']
|
9 |
|
10 |
# Concurrency limiter
|
11 |
MAX_CONCURRENT_REQUESTS = 2
|
|
|
21 |
Returns True if safe or if API URL is not provided, False if unsafe.
|
22 |
"""
|
23 |
if not api_url_template:
|
24 |
+
return True
|
25 |
|
|
|
26 |
if "{text}" not in api_url_template:
|
27 |
print("Warning: NSFW_API_URL_TEMPLATE does not contain {text} placeholder. Skipping NSFW check.")
|
28 |
return True
|
29 |
|
30 |
try:
|
31 |
+
encoded_text = httpx.utils.quote(text)
|
32 |
url = api_url_template.format(text=encoded_text)
|
33 |
|
34 |
async with httpx.AsyncClient() as client:
|
35 |
response = await client.get(url, timeout=10.0)
|
36 |
|
|
|
|
|
37 |
if response.status_code == 200:
|
|
|
|
|
|
|
38 |
return True
|
39 |
else:
|
40 |
print(f"NSFW Check: API request failed or content flagged. Status: {response.status_code}, Response: {response.text[:200]}")
|
41 |
return False
|
42 |
except httpx.RequestError as e:
|
43 |
print(f"NSFW Check: API request error: {e}")
|
44 |
+
return False
|
45 |
except Exception as e:
|
46 |
print(f"NSFW Check: An unexpected error occurred: {e}")
|
47 |
return False
|
|
|
53 |
voice: str,
|
54 |
output_path: str,
|
55 |
model: str = "tts-1-hd",
|
56 |
+
speed: float = 1.0,
|
57 |
+
instructions: str | None = None,
|
58 |
nsfw_api_url_template: str | None = None,
|
59 |
+
line_index: int = -1
|
60 |
) -> str | None:
|
61 |
"""
|
62 |
Synthesizes a single line of text to speech using OpenAI TTS.
|
63 |
+
Includes speed and instructions parameters based on model compatibility.
|
64 |
Retries on RateLimitError with exponential backoff.
|
65 |
Returns the output_path if successful, None otherwise.
|
66 |
"""
|
67 |
if nsfw_api_url_template:
|
68 |
if not await is_content_safe(text, nsfw_api_url_template):
|
69 |
print(f"Line {line_index if line_index != -1 else 'N/A'}: Content flagged as NSFW. Skipping synthesis.")
|
|
|
|
|
70 |
return None
|
71 |
|
|
|
72 |
current_retry = 0
|
73 |
backoff_seconds = INITIAL_BACKOFF_SECONDS
|
74 |
|
75 |
+
async with semaphore:
|
76 |
while current_retry < MAX_RETRIES:
|
77 |
try:
|
78 |
+
request_params = {
|
79 |
+
"model": model,
|
80 |
+
"voice": voice,
|
81 |
+
"input": text,
|
82 |
+
"response_format": "mp3"
|
83 |
+
}
|
84 |
+
|
85 |
+
# Add speed if model supports it and speed is not default
|
86 |
+
if model in ["tts-1", "tts-1-hd"]:
|
87 |
+
if speed is not None and speed != 1.0: # OpenAI default is 1.0
|
88 |
+
# Ensure speed is within valid range for safety, though UI should also constrain this
|
89 |
+
clamped_speed = max(0.25, min(speed, 4.0))
|
90 |
+
request_params["speed"] = clamped_speed
|
91 |
+
|
92 |
+
# Add instructions if model supports it and instructions are provided
|
93 |
+
# Assuming gpt-4o-mini-tts supports it, and tts-1/tts-1-hd do not.
|
94 |
+
if model not in ["tts-1", "tts-1-hd"] and instructions: # Example: gpt-4o-mini-tts
|
95 |
+
request_params["instructions"] = instructions
|
96 |
+
|
97 |
+
response = await client.audio.speech.create(**request_params)
|
98 |
await response.astream_to_file(output_path)
|
|
|
99 |
return output_path
|
100 |
except RateLimitError as e:
|
101 |
current_retry += 1
|
|
|
104 |
return None
|
105 |
print(f"Line {line_index if line_index != -1 else ''}: Rate limit hit. Retrying in {backoff_seconds}s... (Attempt {current_retry}/{MAX_RETRIES})")
|
106 |
await asyncio.sleep(backoff_seconds)
|
107 |
+
backoff_seconds *= 2
|
108 |
except OpenAIError as e:
|
109 |
print(f"Line {line_index if line_index != -1 else ''}: OpenAI API error: {e}")
|
110 |
+
return None
|
111 |
except Exception as e:
|
112 |
print(f"Line {line_index if line_index != -1 else ''}: An unexpected error occurred during synthesis: {e}")
|
113 |
return None
|
114 |
+
return None
|
|
|
115 |
|
116 |
if __name__ == '__main__':
|
|
|
|
|
117 |
async def main_test():
|
118 |
api_key = os.getenv("OPENAI_API_KEY")
|
119 |
if not api_key:
|
|
|
123 |
client = AsyncOpenAI(api_key=api_key)
|
124 |
|
125 |
test_lines = [
|
126 |
+
{"id": 0, "speaker": "Alice", "text": "Hello, this is a test line for Alice, spoken quickly."},
|
127 |
+
{"id": 1, "speaker": "Bob", "text": "And this is Bob, testing his voice with instructions.", "instructions": "Speak in a deep, resonant voice."},
|
128 |
+
{"id": 2, "speaker": "Alice", "text": "A short reply, spoken slowly.", "speed": 0.8},
|
129 |
+
{"id": 3, "speaker": "Charlie", "text": "Charlie here, normal speed."}
|
130 |
]
|
131 |
|
132 |
+
temp_dir = "test_audio_output_enhanced"
|
133 |
os.makedirs(temp_dir, exist_ok=True)
|
134 |
|
135 |
tasks = []
|
136 |
for i, line_data in enumerate(test_lines):
|
137 |
+
# Test with specific models to check param compatibility
|
138 |
+
# For Alice (speed): tts-1-hd. For Bob (instructions): gpt-4o-mini-tts
|
139 |
+
current_model = "tts-1-hd"
|
140 |
+
if "instructions" in line_data:
|
141 |
+
current_model = "gpt-4o-mini-tts" # Example, ensure this model is available for your key
|
142 |
+
|
143 |
+
voice = OPENAI_VOICES[i % len(OPENAI_VOICES)]
|
144 |
+
output_file = os.path.join(temp_dir, f"line_{line_data['id']}_{current_model}.mp3")
|
145 |
+
|
146 |
tasks.append(
|
147 |
+
synthesize_speech_line(
|
148 |
+
client,
|
149 |
+
line_data["text"],
|
150 |
+
voice,
|
151 |
+
output_file,
|
152 |
+
model=current_model,
|
153 |
+
speed=line_data.get("speed", 1.0),
|
154 |
+
instructions=line_data.get("instructions"),
|
155 |
+
line_index=line_data['id']
|
156 |
+
)
|
157 |
)
|
158 |
|
159 |
results = await asyncio.gather(*tasks)
|
|
|
162 |
print(f"\nSuccessfully synthesized {len(successful_files)} out of {len(test_lines)} lines.")
|
163 |
for f_path in successful_files:
|
164 |
print(f" - {f_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
|
166 |
+
if os.name == 'nt':
|
167 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
168 |
asyncio.run(main_test())
|