Spaces:
Running
Running
speed, instructions, granular voice change, tts-1 avainlable
Browse files- app.py +290 -151
- utils/openai_tts.py +58 -46
app.py
CHANGED
@@ -5,20 +5,20 @@ import tempfile
|
|
5 |
import shutil
|
6 |
import zipfile
|
7 |
import random
|
|
|
8 |
from openai import AsyncOpenAI
|
9 |
|
10 |
from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
|
11 |
-
from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES
|
12 |
from utils.merge_audio import merge_mp3_files
|
13 |
|
14 |
# --- Configuration ---
|
15 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
16 |
-
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
|
17 |
-
MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
|
18 |
|
19 |
# Ensure API key is available
|
20 |
if not OPENAI_API_KEY:
|
21 |
-
# Try to read from Hugging Face secrets if running in a Space
|
22 |
try:
|
23 |
from huggingface_hub import HfApi
|
24 |
api = HfApi()
|
@@ -30,82 +30,170 @@ if not OPENAI_API_KEY:
|
|
30 |
MODEL_DEFAULT = secrets.get("MODEL_DEFAULT", MODEL_DEFAULT)
|
31 |
except Exception as e:
|
32 |
print(f"Could not retrieve secrets from Hugging Face Hub: {e}")
|
33 |
-
# Potentially raise an error or disable functionality if key is essential
|
34 |
-
# For now, we'll let it proceed, and OpenAI client init will fail later if key is still None.
|
35 |
|
36 |
-
# Initialize OpenAI client if key is found
|
37 |
async_openai_client = None
|
38 |
if OPENAI_API_KEY:
|
39 |
async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
|
40 |
else:
|
41 |
-
|
42 |
-
print("ERROR: OPENAI_API_KEY secret is not set. The application will not function.")
|
43 |
|
|
|
|
|
|
|
|
|
44 |
|
45 |
-
SPEAKER_VOICE_MAPPING_PRESETS = ["Single voice (alloy)", "Random per Speaker", "A/B (alloy, echo, ...)"]
|
46 |
-
TTS_MODELS = ["tts-1-hd", "gpt-4o-mini-tts"]
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
async def handle_script_processing(
|
80 |
dialogue_script: str,
|
81 |
tts_model: str,
|
82 |
pause_ms: int,
|
83 |
-
|
|
|
|
|
|
|
|
|
84 |
progress=gr.Progress(track_tqdm=True)
|
85 |
):
|
86 |
-
global
|
87 |
-
|
88 |
|
89 |
if not OPENAI_API_KEY or not async_openai_client:
|
90 |
return None, None, "Error: OPENAI_API_KEY is not configured. Cannot proceed."
|
91 |
-
|
92 |
if not dialogue_script.strip():
|
93 |
return None, None, "Error: Script is empty."
|
94 |
|
95 |
-
# Ensure /tmp exists (it should on HF Spaces)
|
96 |
-
os.makedirs("/tmp", exist_ok=True)
|
97 |
-
|
98 |
-
# Create a unique temporary directory for this job's files
|
99 |
-
# This helps in cleaning up and avoiding conflicts if /tmp is shared/persistent
|
100 |
-
# temp_job_dir = tempfile.mkdtemp(dir="/tmp", prefix="dialogue_tts_")
|
101 |
-
# Using a fixed sub-directory in /tmp for simplicity for now, and clearing it.
|
102 |
-
# For more robust multi-user on same instance (not free tier concern), mkdtemp is better.
|
103 |
-
|
104 |
-
# Path for this job's audio files within /tmp
|
105 |
-
# Using a more specific path for cleanup
|
106 |
job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
|
107 |
-
|
108 |
-
# Clear previous job's files from the specific prefix directory
|
109 |
if os.path.exists(job_audio_path_prefix):
|
110 |
shutil.rmtree(job_audio_path_prefix)
|
111 |
os.makedirs(job_audio_path_prefix, exist_ok=True)
|
@@ -113,34 +201,57 @@ async def handle_script_processing(
|
|
113 |
try:
|
114 |
parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
|
115 |
if not parsed_lines:
|
116 |
-
|
117 |
-
|
118 |
-
except ValueError as e: # Handles MAX_SCRIPT_LENGTH
|
119 |
-
# shutil.rmtree(job_audio_path_prefix) # Clean up
|
120 |
return None, None, f"Script parsing error: {str(e)}"
|
121 |
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
-
|
126 |
-
|
127 |
-
# The actual voice assignment per line happens inside the task creation loop
|
128 |
-
# To ensure stability of voice per speaker if "Random" is chosen.
|
129 |
-
_ = [get_voice_for_speaker(p["speaker"], speaker_voice_preset, parsed_lines) for p in parsed_lines]
|
130 |
|
131 |
for i, line_data in enumerate(parsed_lines):
|
132 |
-
|
133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
134 |
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
137 |
tasks.append(
|
138 |
synthesize_speech_line(
|
139 |
client=async_openai_client,
|
140 |
text=line_data["text"],
|
141 |
-
voice=
|
142 |
output_path=output_filename,
|
143 |
model=tts_model,
|
|
|
|
|
144 |
nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
|
145 |
line_index=line_data['id']
|
146 |
)
|
@@ -148,60 +259,29 @@ async def handle_script_processing(
|
|
148 |
|
149 |
synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
|
150 |
|
151 |
-
# Place results into line_audio_files based on original line_data['id'] if possible,
|
152 |
-
# or simply by order of completion if IDs are not perfectly mapping (should not happen with current setup).
|
153 |
-
# For now, assuming `synthesis_results` order matches `parsed_lines` due to `asyncio.gather` preserving order.
|
154 |
for idx, result in enumerate(synthesis_results):
|
155 |
if isinstance(result, Exception):
|
156 |
-
# Log the error, the file will remain None
|
157 |
print(f"Error during synthesis for line {parsed_lines[idx]['id']}: {result}")
|
158 |
-
|
159 |
-
elif result is None: # Synthesis skipped (e.g. NSFW) or failed internally
|
160 |
print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
|
161 |
else:
|
162 |
-
# Store the path of the successfully synthesized file
|
163 |
-
# We need to map 'idx' from gather result back to original 'id' if they differ
|
164 |
-
# For now, assume parsed_lines[idx]['id'] is the relevant one.
|
165 |
-
# line_audio_files is already indexed by 'idx' which corresponds to parsed_lines order.
|
166 |
line_audio_files[idx] = result
|
167 |
|
168 |
valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
169 |
|
170 |
if not valid_audio_files:
|
171 |
-
shutil.rmtree(job_audio_path_prefix)
|
172 |
-
return None, None, "Error: No audio files were successfully synthesized.
|
173 |
|
174 |
-
# 1. Create ZIP of individual lines
|
175 |
zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
176 |
with zipfile.ZipFile(zip_filename, 'w') as zf:
|
177 |
for audio_file_path in valid_audio_files:
|
178 |
zf.write(audio_file_path, os.path.basename(audio_file_path))
|
179 |
|
180 |
-
# 2. Create merged MP3
|
181 |
-
# We need to pass the *ordered* list of successfully generated files for merging
|
182 |
-
# `line_audio_files` contains paths or None, in the original script order.
|
183 |
ordered_valid_files_for_merge = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
184 |
-
|
185 |
merged_mp3_filename = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
|
186 |
merged_output_path = merge_mp3_files(ordered_valid_files_for_merge, merged_mp3_filename, pause_ms)
|
187 |
|
188 |
-
if not merged_output_path:
|
189 |
-
# Merging failed, but we might still have the zip.
|
190 |
-
# Return zip, and None for merged, with an error message.
|
191 |
-
# For simplicity now, let's consider this a partial success if zip is there.
|
192 |
-
# Or, make it an error:
|
193 |
-
# shutil.rmtree(job_audio_path_prefix)
|
194 |
-
# return None, None, "Error: Failed to merge audio files, though individual lines might be available."
|
195 |
-
# Let's return what we have
|
196 |
-
print("Warning: Merged MP3 generation failed. Individual files might still be in ZIP.")
|
197 |
-
# We still return the zip path, and None for merged path
|
198 |
-
|
199 |
-
# Gradio output: individual_lines_zip, merged_dialogue_mp3, status_message
|
200 |
-
# If merged_output_path is None, Gradio will show nothing for that component if its type expects a file.
|
201 |
-
# It's better to return a string path, even if the file doesn't exist, and let Gradio handle it,
|
202 |
-
# or handle it by returning None and ensure the component can take None.
|
203 |
-
# For File output, returning None is fine.
|
204 |
-
|
205 |
final_status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
|
206 |
if len(valid_audio_files) < len(parsed_lines):
|
207 |
final_status += "Some lines failed or were skipped. "
|
@@ -212,20 +292,6 @@ async def handle_script_processing(
|
|
212 |
else:
|
213 |
final_status += "Outputs generated."
|
214 |
|
215 |
-
# Make copies of the files to a location Gradio can serve if they are in the job_audio_path_prefix
|
216 |
-
# which might be cleaned up. Gradio makes its own copies for File output components.
|
217 |
-
# So, returning paths from job_audio_path_prefix is fine.
|
218 |
-
|
219 |
-
# Return paths for Gradio File components
|
220 |
-
# Gradio will make these downloadable.
|
221 |
-
# If a file doesn't exist or is None, the Gradio component should handle it gracefully (e.g., show nothing).
|
222 |
-
|
223 |
-
# Intentionally DO NOT clean up job_audio_path_prefix here.
|
224 |
-
# Gradio needs access to these files to serve them.
|
225 |
-
# Cleanup should happen at the start of the *next* run, or via a different mechanism
|
226 |
-
# if HF Spaces doesn't clean /tmp periodically.
|
227 |
-
# The current strategy of clearing job_audio_path_prefix at the start of handle_script_processing is good.
|
228 |
-
|
229 |
return zip_filename if os.path.exists(zip_filename) else None, \
|
230 |
merged_output_path if merged_output_path and os.path.exists(merged_output_path) else None, \
|
231 |
final_status
|
@@ -240,44 +306,72 @@ def handle_calculate_cost(dialogue_script: str, tts_model: str):
|
|
240 |
return "Cost: $0.000000 (No valid lines)"
|
241 |
cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
|
242 |
return f"Estimated OpenAI Cost: ${cost:.6f} (for {total_chars} characters in {len(parsed_lines)} lines)"
|
243 |
-
except ValueError as e:
|
244 |
return f"Error: {str(e)}"
|
245 |
except Exception as e:
|
246 |
return f"Error calculating cost: {str(e)}"
|
247 |
|
248 |
# --- Gradio Interface ---
|
249 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
250 |
-
gr.Markdown("# Dialogue Script to Speech Converter")
|
251 |
gr.Markdown(
|
252 |
-
"Convert
|
253 |
-
"Supports `tts-1-hd` and `gpt-4o-mini-tts` from OpenAI. "
|
254 |
-
"Uses a maximum of 2 concurrent TTS requests."
|
255 |
)
|
256 |
if not OPENAI_API_KEY or not async_openai_client:
|
257 |
-
gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid.
|
258 |
-
|
259 |
|
260 |
with gr.Row():
|
261 |
with gr.Column(scale=2):
|
262 |
script_input = gr.TextArea(
|
263 |
label="Dialogue Script",
|
264 |
-
placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!
|
265 |
lines=10,
|
266 |
info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
|
267 |
)
|
268 |
with gr.Column(scale=1):
|
269 |
tts_model_dropdown = gr.Dropdown(
|
270 |
-
|
271 |
-
|
272 |
-
speaker_voice_preset_dropdown = gr.Dropdown(
|
273 |
-
SPEAKER_VOICE_MAPPING_PRESETS, label="Speaker Voice Assignment", value=SPEAKER_VOICE_MAPPING_PRESETS[0],
|
274 |
-
info="How voices are assigned to speakers."
|
275 |
)
|
276 |
pause_input = gr.Number(
|
277 |
label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
|
278 |
info="Silence duration in milliseconds between merged lines."
|
279 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
with gr.Row():
|
282 |
calculate_cost_button = gr.Button("Calculate Estimated Cost")
|
283 |
generate_button = gr.Button("Generate Audio Files", variant="primary")
|
@@ -288,8 +382,38 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
288 |
individual_lines_zip_output = gr.File(label="Download ZIP of Individual Lines")
|
289 |
merged_dialogue_mp3_output = gr.Audio(label="Merged Dialogue MP3", type="filepath")
|
290 |
|
291 |
-
status_output = gr.Textbox(label="Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
293 |
calculate_cost_button.click(
|
294 |
fn=handle_calculate_cost,
|
295 |
inputs=[script_input, tts_model_dropdown],
|
@@ -298,34 +422,49 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
298 |
|
299 |
generate_button.click(
|
300 |
fn=handle_script_processing,
|
301 |
-
inputs=[
|
|
|
|
|
|
|
|
|
302 |
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
303 |
-
# api_name="generate_audio" # if you want to expose an API endpoint
|
304 |
)
|
305 |
|
306 |
gr.Markdown("## Examples")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
gr.Examples(
|
308 |
examples=[
|
309 |
-
[
|
310 |
-
|
311 |
-
|
312 |
-
],
|
313 |
-
[
|
314 |
-
"[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain.\n[Captain Eva] Excellent. Maintain course for Kepler-186f.",
|
315 |
-
"gpt-4o-mini-tts", 600, SPEAKER_VOICE_MAPPING_PRESETS[2]
|
316 |
-
],
|
317 |
-
[
|
318 |
-
"A single line of narration, no speaker tag initially. This will be auto-assigned to 'Narrator'.",
|
319 |
-
"tts-1", 0, SPEAKER_VOICE_MAPPING_PRESETS[0] # tts-1 to show it works too
|
320 |
-
]
|
321 |
],
|
322 |
-
inputs=[
|
323 |
-
|
324 |
-
|
325 |
-
|
|
|
|
|
|
|
|
|
326 |
)
|
327 |
|
328 |
if __name__ == "__main__":
|
329 |
-
if os.name == 'nt':
|
330 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
331 |
demo.launch()
|
|
|
5 |
import shutil
|
6 |
import zipfile
|
7 |
import random
|
8 |
+
import json
|
9 |
from openai import AsyncOpenAI
|
10 |
|
11 |
from utils.script_parser import parse_dialogue_script, calculate_cost, MAX_SCRIPT_LENGTH
|
12 |
+
from utils.openai_tts import synthesize_speech_line, OPENAI_VOICES as ALL_TTS_VOICES # Renamed for clarity
|
13 |
from utils.merge_audio import merge_mp3_files
|
14 |
|
15 |
# --- Configuration ---
|
16 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
17 |
+
NSFW_API_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
|
18 |
+
MODEL_DEFAULT = os.getenv("MODEL_DEFAULT", "tts-1-hd")
|
19 |
|
20 |
# Ensure API key is available
|
21 |
if not OPENAI_API_KEY:
|
|
|
22 |
try:
|
23 |
from huggingface_hub import HfApi
|
24 |
api = HfApi()
|
|
|
30 |
MODEL_DEFAULT = secrets.get("MODEL_DEFAULT", MODEL_DEFAULT)
|
31 |
except Exception as e:
|
32 |
print(f"Could not retrieve secrets from Hugging Face Hub: {e}")
|
|
|
|
|
33 |
|
|
|
34 |
async_openai_client = None
|
35 |
if OPENAI_API_KEY:
|
36 |
async_openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
|
37 |
else:
|
38 |
+
print("ERROR: OPENAI_API_KEY secret is not set. The application will not function properly.")
|
|
|
39 |
|
40 |
+
# TTS Models available in the UI
|
41 |
+
TTS_MODELS_AVAILABLE = ["tts-1", "tts-1-hd", "gpt-4o-mini-tts"]
|
42 |
+
if MODEL_DEFAULT not in TTS_MODELS_AVAILABLE: # Ensure default is valid
|
43 |
+
MODEL_DEFAULT = "tts-1-hd"
|
44 |
|
|
|
|
|
45 |
|
46 |
+
SPEAKER_CONFIG_METHODS = [
|
47 |
+
"Single Voice (Global)",
|
48 |
+
"Random per Speaker",
|
49 |
+
"A/B Round Robin",
|
50 |
+
"Detailed Configuration (JSON)"
|
51 |
+
]
|
52 |
+
DEFAULT_SPEAKER_CONFIG_METHOD = "Random per Speaker"
|
53 |
|
54 |
+
# For UI elements that need the list of voices
|
55 |
+
APP_AVAILABLE_VOICES = ALL_TTS_VOICES.copy()
|
|
|
|
|
56 |
|
57 |
+
|
58 |
+
# Global store for parsed speaker configurations
|
59 |
+
_speaker_config_cache = {}
|
60 |
+
|
61 |
+
def parse_detailed_speaker_config(json_text, parsed_script_lines):
|
62 |
+
""" Parses the JSON config string into a speaker map. """
|
63 |
+
config_map = {}
|
64 |
+
default_voice = APP_AVAILABLE_VOICES[0]
|
65 |
+
try:
|
66 |
+
if not json_text.strip():
|
67 |
+
return {} # Empty config means rely on global or other fallbacks
|
68 |
+
|
69 |
+
config_list = json.loads(json_text)
|
70 |
+
if not isinstance(config_list, list):
|
71 |
+
raise ValueError("JSON config must be a list of speaker objects.")
|
72 |
+
|
73 |
+
for item in config_list:
|
74 |
+
if not isinstance(item, dict) or "speaker" not in item or "voice" not in item:
|
75 |
+
# Log warning or skip malformed item
|
76 |
+
print(f"Skipping malformed item in JSON config: {item}")
|
77 |
+
continue
|
78 |
+
if item["voice"] not in APP_AVAILABLE_VOICES:
|
79 |
+
print(f"Warning: Voice '{item['voice']}' for speaker '{item['speaker']}' not recognized. Falling back to '{default_voice}'.")
|
80 |
+
item["voice"] = default_voice
|
81 |
+
|
82 |
+
# Validate speed if present
|
83 |
+
if "speed" in item:
|
84 |
+
try:
|
85 |
+
item["speed"] = float(item["speed"])
|
86 |
+
if not (0.25 <= item["speed"] <= 4.0):
|
87 |
+
print(f"Warning: Speed for speaker '{item['speaker']}' out of range (0.25-4.0). Clamping or defaulting.")
|
88 |
+
item["speed"] = max(0.25, min(item["speed"], 4.0)) # Clamp
|
89 |
+
except ValueError:
|
90 |
+
print(f"Warning: Invalid speed value for speaker '{item['speaker']}'. Using default.")
|
91 |
+
# Let it be None or remove, so global/default speed applies
|
92 |
+
item.pop("speed", None)
|
93 |
+
|
94 |
+
|
95 |
+
config_map[item["speaker"]] = {
|
96 |
+
"voice": item["voice"],
|
97 |
+
"speed": item.get("speed"), # Will be None if not present or invalid
|
98 |
+
"instructions": item.get("instructions") # Will be None if not present
|
99 |
+
}
|
100 |
+
return config_map
|
101 |
+
except json.JSONDecodeError as e:
|
102 |
+
raise ValueError(f"Invalid JSON in Detailed Speaker Configuration: {e}")
|
103 |
+
except ValueError as e: # Catch our own ValueErrors
|
104 |
+
raise e # Re-raise
|
105 |
+
except Exception as e: # Catch any other unexpected errors during parsing
|
106 |
+
raise ValueError(f"Error parsing Detailed Speaker Configuration: {e}")
|
107 |
+
|
108 |
+
|
109 |
+
def get_config_for_speaker(speaker_name, speaker_config_method, unique_script_speakers,
|
110 |
+
global_selected_voice, detailed_config_map):
|
111 |
+
global _speaker_config_cache # This cache helps maintain consistency for "Random" and "A/B" within a run
|
112 |
+
|
113 |
+
# If method changed or cache is for a different set of speakers, reset it.
|
114 |
+
# A more robust cache key might involve hashing unique_script_speakers.
|
115 |
+
if _speaker_config_cache.get("__method") != speaker_config_method or \
|
116 |
+
_speaker_config_cache.get("__speakers_set") != frozenset(unique_script_speakers):
|
117 |
+
_speaker_config_cache = {"__method": speaker_config_method, "__speakers_set": frozenset(unique_script_speakers)}
|
118 |
+
|
119 |
+
base_config = {"voice": APP_AVAILABLE_VOICES[0], "speed": None, "instructions": None}
|
120 |
+
|
121 |
+
if speaker_config_method == "Single Voice (Global)":
|
122 |
+
base_config["voice"] = global_selected_voice if global_selected_voice in APP_AVAILABLE_VOICES else APP_AVAILABLE_VOICES[0]
|
123 |
+
return base_config
|
124 |
|
125 |
+
if speaker_config_method == "Detailed Configuration (JSON)":
|
126 |
+
if speaker_name in detailed_config_map:
|
127 |
+
# Merge with base_config to ensure all keys are present if some are optional in JSON
|
128 |
+
# JSON values take precedence
|
129 |
+
speaker_specific = detailed_config_map[speaker_name]
|
130 |
+
return {
|
131 |
+
"voice": speaker_specific.get("voice", base_config["voice"]),
|
132 |
+
"speed": speaker_specific.get("speed"), # Allow None to use global
|
133 |
+
"instructions": speaker_specific.get("instructions") # Allow None to use global
|
134 |
+
}
|
135 |
+
else: # Fallback for speakers in script but not in JSON map
|
136 |
+
# Could use a default voice, or a cycling voice for unmapped speakers
|
137 |
+
# For now, let's use the first voice as a simple fallback.
|
138 |
+
print(f"Warning: Speaker '{speaker_name}' not found in Detailed JSON. Using default voice '{base_config['voice']}'.")
|
139 |
+
return base_config
|
140 |
+
|
141 |
+
|
142 |
+
# For "Random" and "A/B", we only map voices. Speed/Instructions will be global.
|
143 |
+
if speaker_name not in _speaker_config_cache:
|
144 |
+
if speaker_config_method == "Random per Speaker":
|
145 |
+
# Assign a random voice if not already cached for this run
|
146 |
+
available_voices_shuffled = random.sample(APP_AVAILABLE_VOICES, len(APP_AVAILABLE_VOICES))
|
147 |
+
# Ensure all unique speakers get an assignment before reusing voices from cache build
|
148 |
+
if not _speaker_config_cache.get("__all_assigned_random"):
|
149 |
+
for i, spk_unique in enumerate(unique_script_speakers):
|
150 |
+
if spk_unique not in _speaker_config_cache:
|
151 |
+
_speaker_config_cache[spk_unique] = {"voice": available_voices_shuffled[i % len(available_voices_shuffled)]}
|
152 |
+
_speaker_config_cache["__all_assigned_random"] = True
|
153 |
+
|
154 |
+
# If somehow still not found (should not happen if pre-populated)
|
155 |
+
if speaker_name not in _speaker_config_cache:
|
156 |
+
_speaker_config_cache[speaker_name] = {"voice": random.choice(APP_AVAILABLE_VOICES)}
|
157 |
+
|
158 |
+
elif speaker_config_method == "A/B Round Robin":
|
159 |
+
# Assign voices in order if not already cached
|
160 |
+
if not _speaker_config_cache.get("__all_assigned_ab"):
|
161 |
+
for i, spk_unique in enumerate(unique_script_speakers):
|
162 |
+
if spk_unique not in _speaker_config_cache:
|
163 |
+
_speaker_config_cache[spk_unique] = {"voice": APP_AVAILABLE_VOICES[i % len(APP_AVAILABLE_VOICES)]}
|
164 |
+
_speaker_config_cache["__all_assigned_ab"] = True
|
165 |
+
|
166 |
+
if speaker_name not in _speaker_config_cache: # Fallback, should be populated
|
167 |
+
speaker_idx = unique_script_speakers.index(speaker_name) if speaker_name in unique_script_speakers else 0
|
168 |
+
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[speaker_idx % len(APP_AVAILABLE_VOICES)]}
|
169 |
+
else: # Should not happen
|
170 |
+
_speaker_config_cache[speaker_name] = {"voice": APP_AVAILABLE_VOICES[0]}
|
171 |
+
|
172 |
+
# Return only voice for Random/AB, speed/instructions will be from global UI inputs
|
173 |
+
cached_entry = _speaker_config_cache.get(speaker_name, base_config.copy())
|
174 |
+
return {"voice": cached_entry.get("voice", base_config["voice"]), "speed": None, "instructions": None}
|
175 |
+
|
176 |
|
177 |
async def handle_script_processing(
|
178 |
dialogue_script: str,
|
179 |
tts_model: str,
|
180 |
pause_ms: int,
|
181 |
+
speaker_config_method: str,
|
182 |
+
global_voice_selection: str, # From dropdown if "Single Voice (Global)"
|
183 |
+
detailed_speaker_json: str, # From JSON input
|
184 |
+
global_speed: float,
|
185 |
+
global_instructions: str,
|
186 |
progress=gr.Progress(track_tqdm=True)
|
187 |
):
|
188 |
+
global _speaker_config_cache
|
189 |
+
_speaker_config_cache = {} # Reset speaker config cache for each new run
|
190 |
|
191 |
if not OPENAI_API_KEY or not async_openai_client:
|
192 |
return None, None, "Error: OPENAI_API_KEY is not configured. Cannot proceed."
|
|
|
193 |
if not dialogue_script.strip():
|
194 |
return None, None, "Error: Script is empty."
|
195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
job_audio_path_prefix = os.path.join(tempfile.gettempdir(), "current_job_audio")
|
|
|
|
|
197 |
if os.path.exists(job_audio_path_prefix):
|
198 |
shutil.rmtree(job_audio_path_prefix)
|
199 |
os.makedirs(job_audio_path_prefix, exist_ok=True)
|
|
|
201 |
try:
|
202 |
parsed_lines, total_chars = parse_dialogue_script(dialogue_script)
|
203 |
if not parsed_lines:
|
204 |
+
return None, None, "Error: No valid dialogue lines found."
|
205 |
+
except ValueError as e:
|
|
|
|
|
206 |
return None, None, f"Script parsing error: {str(e)}"
|
207 |
|
208 |
+
unique_speakers = sorted(list(set(p["speaker"] for p in parsed_lines)))
|
209 |
+
parsed_detailed_config_map = {}
|
210 |
+
if speaker_config_method == "Detailed Configuration (JSON)":
|
211 |
+
try:
|
212 |
+
parsed_detailed_config_map = parse_detailed_speaker_config(detailed_speaker_json, parsed_lines)
|
213 |
+
except ValueError as e:
|
214 |
+
return None, None, f"Configuration Error: {str(e)}"
|
215 |
|
216 |
+
tasks = []
|
217 |
+
line_audio_files = [None] * len(parsed_lines)
|
|
|
|
|
|
|
218 |
|
219 |
for i, line_data in enumerate(parsed_lines):
|
220 |
+
speaker_name = line_data["speaker"]
|
221 |
+
|
222 |
+
# Get base config (primarily voice) based on method
|
223 |
+
# For "Random" and "A/B", this will just return voice. Speed/instructions are taken from global.
|
224 |
+
# For "Single Voice", it returns the globally selected voice.
|
225 |
+
# For "Detailed JSON", it returns voice, and potentially per-speaker speed/instructions.
|
226 |
+
speaker_base_cfg = get_config_for_speaker(
|
227 |
+
speaker_name, speaker_config_method, unique_speakers,
|
228 |
+
global_voice_selection, parsed_detailed_config_map
|
229 |
+
)
|
230 |
+
|
231 |
+
line_voice = speaker_base_cfg["voice"]
|
232 |
|
233 |
+
# Determine effective speed and instructions
|
234 |
+
# Priority: Per-speaker from JSON > Global UI > API Default (1.0 for speed, None for instructions)
|
235 |
+
effective_speed = global_speed # Start with global
|
236 |
+
if speaker_base_cfg.get("speed") is not None: # If JSON provided a speed for this speaker
|
237 |
+
effective_speed = speaker_base_cfg["speed"]
|
238 |
+
|
239 |
+
effective_instructions = global_instructions if global_instructions and global_instructions.strip() else None
|
240 |
+
if speaker_base_cfg.get("instructions") is not None and speaker_base_cfg["instructions"].strip(): # If JSON provided instructions
|
241 |
+
effective_instructions = speaker_base_cfg["instructions"]
|
242 |
+
|
243 |
+
output_filename = os.path.join(job_audio_path_prefix, f"line_{line_data['id']}.mp3")
|
244 |
+
progress(i / len(parsed_lines), desc=f"Synthesizing line {i+1}/{len(parsed_lines)} ({speaker_name} w/ {line_voice})")
|
245 |
|
246 |
tasks.append(
|
247 |
synthesize_speech_line(
|
248 |
client=async_openai_client,
|
249 |
text=line_data["text"],
|
250 |
+
voice=line_voice,
|
251 |
output_path=output_filename,
|
252 |
model=tts_model,
|
253 |
+
speed=effective_speed,
|
254 |
+
instructions=effective_instructions,
|
255 |
nsfw_api_url_template=NSFW_API_URL_TEMPLATE,
|
256 |
line_index=line_data['id']
|
257 |
)
|
|
|
259 |
|
260 |
synthesis_results = await asyncio.gather(*tasks, return_exceptions=True)
|
261 |
|
|
|
|
|
|
|
262 |
for idx, result in enumerate(synthesis_results):
|
263 |
if isinstance(result, Exception):
|
|
|
264 |
print(f"Error during synthesis for line {parsed_lines[idx]['id']}: {result}")
|
265 |
+
elif result is None:
|
|
|
266 |
print(f"Synthesis skipped or failed for line {parsed_lines[idx]['id']}")
|
267 |
else:
|
|
|
|
|
|
|
|
|
268 |
line_audio_files[idx] = result
|
269 |
|
270 |
valid_audio_files = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
271 |
|
272 |
if not valid_audio_files:
|
273 |
+
shutil.rmtree(job_audio_path_prefix)
|
274 |
+
return None, None, "Error: No audio files were successfully synthesized."
|
275 |
|
|
|
276 |
zip_filename = os.path.join(job_audio_path_prefix, "dialogue_lines.zip")
|
277 |
with zipfile.ZipFile(zip_filename, 'w') as zf:
|
278 |
for audio_file_path in valid_audio_files:
|
279 |
zf.write(audio_file_path, os.path.basename(audio_file_path))
|
280 |
|
|
|
|
|
|
|
281 |
ordered_valid_files_for_merge = [f for f in line_audio_files if f and os.path.exists(f) and os.path.getsize(f) > 0]
|
|
|
282 |
merged_mp3_filename = os.path.join(job_audio_path_prefix, "merged_dialogue.mp3")
|
283 |
merged_output_path = merge_mp3_files(ordered_valid_files_for_merge, merged_mp3_filename, pause_ms)
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
final_status = f"Processed {len(valid_audio_files)}/{len(parsed_lines)} lines. "
|
286 |
if len(valid_audio_files) < len(parsed_lines):
|
287 |
final_status += "Some lines failed or were skipped. "
|
|
|
292 |
else:
|
293 |
final_status += "Outputs generated."
|
294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
return zip_filename if os.path.exists(zip_filename) else None, \
|
296 |
merged_output_path if merged_output_path and os.path.exists(merged_output_path) else None, \
|
297 |
final_status
|
|
|
306 |
return "Cost: $0.000000 (No valid lines)"
|
307 |
cost = calculate_cost(total_chars, len(parsed_lines), tts_model)
|
308 |
return f"Estimated OpenAI Cost: ${cost:.6f} (for {total_chars} characters in {len(parsed_lines)} lines)"
|
309 |
+
except ValueError as e:
|
310 |
return f"Error: {str(e)}"
|
311 |
except Exception as e:
|
312 |
return f"Error calculating cost: {str(e)}"
|
313 |
|
314 |
# --- Gradio Interface ---
|
315 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
316 |
+
gr.Markdown("# Dialogue Script to Speech Converter (Enhanced)")
|
317 |
gr.Markdown(
|
318 |
+
"Convert dialogue scripts to speech with fine-grained control over voices, speed, and instructions."
|
|
|
|
|
319 |
)
|
320 |
if not OPENAI_API_KEY or not async_openai_client:
|
321 |
+
gr.Markdown("<h3 style='color:red;'>Warning: OPENAI_API_KEY secret is not set or invalid. Speech synthesis will fail.</h3>")
|
|
|
322 |
|
323 |
with gr.Row():
|
324 |
with gr.Column(scale=2):
|
325 |
script_input = gr.TextArea(
|
326 |
label="Dialogue Script",
|
327 |
+
placeholder="[Speaker One] Hello there!\n[Speaker Two] General Kenobi!",
|
328 |
lines=10,
|
329 |
info=f"Format: `[Speaker] Utterance` per line. Max {MAX_SCRIPT_LENGTH} chars total."
|
330 |
)
|
331 |
with gr.Column(scale=1):
|
332 |
tts_model_dropdown = gr.Dropdown(
|
333 |
+
TTS_MODELS_AVAILABLE, label="OpenAI TTS Model", value=MODEL_DEFAULT,
|
334 |
+
info="Select TTS model. Affects available controls below."
|
|
|
|
|
|
|
335 |
)
|
336 |
pause_input = gr.Number(
|
337 |
label="Pause Between Lines (ms)", value=500, minimum=0, maximum=5000, step=50,
|
338 |
info="Silence duration in milliseconds between merged lines."
|
339 |
)
|
340 |
+
# Model-specific global controls
|
341 |
+
global_speed_input = gr.Slider(
|
342 |
+
minimum=0.25, maximum=4.0, value=1.0, step=0.05,
|
343 |
+
label="Global Speech Speed (for tts-1/tts-1-hd)",
|
344 |
+
visible= (MODEL_DEFAULT in ["tts-1", "tts-1-hd"]),
|
345 |
+
interactive=True
|
346 |
+
)
|
347 |
+
global_instructions_input = gr.Textbox(
|
348 |
+
label="Global Voice Instructions (for gpt-4o-mini-tts)",
|
349 |
+
placeholder="e.g., Speak in a calm, reassuring tone.",
|
350 |
+
visible=(MODEL_DEFAULT == "gpt-4o-mini-tts"),
|
351 |
+
interactive=True, lines=2
|
352 |
+
)
|
353 |
|
354 |
+
gr.Markdown("### Speaker Configuration")
|
355 |
+
with gr.Row():
|
356 |
+
speaker_config_method_dropdown = gr.Dropdown(
|
357 |
+
SPEAKER_CONFIG_METHODS, label="Speaker Configuration Method", value=DEFAULT_SPEAKER_CONFIG_METHOD
|
358 |
+
)
|
359 |
+
global_voice_dropdown = gr.Dropdown(
|
360 |
+
APP_AVAILABLE_VOICES, label="Global Voice (for 'Single Voice' method)",
|
361 |
+
value=APP_AVAILABLE_VOICES[0],
|
362 |
+
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Single Voice (Global)"),
|
363 |
+
interactive=True
|
364 |
+
)
|
365 |
+
|
366 |
+
detailed_speaker_config_input = gr.Code(
|
367 |
+
label="Detailed Speaker Configuration (JSON)",
|
368 |
+
language="json",
|
369 |
+
lines=7,
|
370 |
+
placeholder='[\n {"speaker": "Alice", "voice": "nova", "speed": 1.1, "instructions": "sound excited"},\n {"speaker": "Bob", "voice": "echo"},\n {"speaker": "Narrator", "voice": "shimmer", "instructions": "be very serious"}\n]',
|
371 |
+
visible=(DEFAULT_SPEAKER_CONFIG_METHOD == "Detailed Configuration (JSON)"),
|
372 |
+
info="Define voice, and optionally model-compatible speed/instructions per speaker."
|
373 |
+
)
|
374 |
+
|
375 |
with gr.Row():
|
376 |
calculate_cost_button = gr.Button("Calculate Estimated Cost")
|
377 |
generate_button = gr.Button("Generate Audio Files", variant="primary")
|
|
|
382 |
individual_lines_zip_output = gr.File(label="Download ZIP of Individual Lines")
|
383 |
merged_dialogue_mp3_output = gr.Audio(label="Merged Dialogue MP3", type="filepath")
|
384 |
|
385 |
+
status_output = gr.Textbox(label="Status", interactive=False, lines=2)
|
386 |
+
|
387 |
+
# --- Event Handlers for UI Interactivity ---
|
388 |
+
def update_model_specific_controls_visibility(selected_model):
|
389 |
+
is_tts_1_family = selected_model in ["tts-1", "tts-1-hd"]
|
390 |
+
is_gpt_mini = selected_model == "gpt-4o-mini-tts" # And any other future models supporting instructions
|
391 |
+
return {
|
392 |
+
global_speed_input: gr.update(visible=is_tts_1_family, interactive=is_tts_1_family),
|
393 |
+
global_instructions_input: gr.update(visible=is_gpt_mini, interactive=is_gpt_mini)
|
394 |
+
}
|
395 |
+
|
396 |
+
tts_model_dropdown.change(
|
397 |
+
fn=update_model_specific_controls_visibility,
|
398 |
+
inputs=[tts_model_dropdown],
|
399 |
+
outputs=[global_speed_input, global_instructions_input]
|
400 |
+
)
|
401 |
|
402 |
+
def update_speaker_config_visibility(config_method):
    """Show only the speaker-configuration input matching the chosen method.

    'Single Voice (Global)' reveals the global voice dropdown;
    'Detailed Configuration (JSON)' reveals the per-speaker JSON editor.
    """
    use_single_voice = config_method == "Single Voice (Global)"
    use_detailed_json = config_method == "Detailed Configuration (JSON)"
    return {
        global_voice_dropdown: gr.update(visible=use_single_voice, interactive=use_single_voice),
        detailed_speaker_config_input: gr.update(visible=use_detailed_json, interactive=use_detailed_json),
    }
|
409 |
+
|
410 |
+
speaker_config_method_dropdown.change(
|
411 |
+
fn=update_speaker_config_visibility,
|
412 |
+
inputs=[speaker_config_method_dropdown],
|
413 |
+
outputs=[global_voice_dropdown, detailed_speaker_config_input]
|
414 |
+
)
|
415 |
+
|
416 |
+
# --- Main Button Actions ---
|
417 |
calculate_cost_button.click(
|
418 |
fn=handle_calculate_cost,
|
419 |
inputs=[script_input, tts_model_dropdown],
|
|
|
422 |
|
423 |
generate_button.click(
|
424 |
fn=handle_script_processing,
|
425 |
+
inputs=[
|
426 |
+
script_input, tts_model_dropdown, pause_input,
|
427 |
+
speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
|
428 |
+
global_speed_input, global_instructions_input
|
429 |
+
],
|
430 |
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
|
|
431 |
)
|
432 |
|
433 |
gr.Markdown("## Examples")
|
434 |
+
example_script_1 = "[Alice] Hi Bob, how are things?\n[Bob] Pretty good, Alice. Just working on this new project.\n[Alice] Oh, interesting! Tell me more."
|
435 |
+
example_json_1 = """
|
436 |
+
[
|
437 |
+
{"speaker": "Alice", "voice": "nova", "instructions": "sound curious"},
|
438 |
+
{"speaker": "Bob", "voice": "echo", "speed": 0.9}
|
439 |
+
]
|
440 |
+
""".strip()
|
441 |
+
|
442 |
+
example_script_2 = "[Captain Eva] Computer, status report.\n[Computer] All systems nominal, Captain. I am speaking slowly.\n[Captain Eva] Excellent. Maintain course for Kepler-186f."
|
443 |
+
example_json_2 = """
|
444 |
+
[
|
445 |
+
{"speaker": "Captain Eva", "voice": "alloy", "speed": 1.0},
|
446 |
+
{"speaker": "Computer", "voice": "onyx", "speed": 0.8, "instructions": "sound robotic and calm"}
|
447 |
+
]
|
448 |
+
""".strip()
|
449 |
+
|
450 |
+
|
451 |
gr.Examples(
|
452 |
examples=[
|
453 |
+
[example_script_1, "gpt-4o-mini-tts", 250, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_1, 1.0, "Speak naturally."],
|
454 |
+
[example_script_2, "tts-1-hd", 300, "Detailed Configuration (JSON)", APP_AVAILABLE_VOICES[0], example_json_2, 1.1, ""],
|
455 |
+
["[Narrator] A single line, using global settings.", "tts-1", 0, "Single Voice (Global)", "fable", "", 1.2, ""]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
456 |
],
|
457 |
+
inputs=[
|
458 |
+
script_input, tts_model_dropdown, pause_input,
|
459 |
+
speaker_config_method_dropdown, global_voice_dropdown, detailed_speaker_config_input,
|
460 |
+
global_speed_input, global_instructions_input
|
461 |
+
],
|
462 |
+
outputs=[individual_lines_zip_output, merged_dialogue_mp3_output, status_output],
|
463 |
+
fn=handle_script_processing,
|
464 |
+
cache_examples=False,
|
465 |
)
|
466 |
|
467 |
if __name__ == "__main__":
|
468 |
+
if os.name == 'nt':
|
469 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
470 |
demo.launch()
|
utils/openai_tts.py
CHANGED
@@ -4,8 +4,8 @@ import time
|
|
4 |
from openai import AsyncOpenAI, OpenAIError, RateLimitError
|
5 |
import httpx # For NSFW check
|
6 |
|
7 |
-
#
|
8 |
-
OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
|
9 |
|
10 |
# Concurrency limiter
|
11 |
MAX_CONCURRENT_REQUESTS = 2
|
@@ -21,33 +21,27 @@ async def is_content_safe(text: str, api_url_template: str | None) -> bool:
|
|
21 |
Returns True if safe or if API URL is not provided, False if unsafe.
|
22 |
"""
|
23 |
if not api_url_template:
|
24 |
-
return True
|
25 |
|
26 |
-
# Basic template check - expecting {text} placeholder
|
27 |
if "{text}" not in api_url_template:
|
28 |
print("Warning: NSFW_API_URL_TEMPLATE does not contain {text} placeholder. Skipping NSFW check.")
|
29 |
return True
|
30 |
|
31 |
try:
|
32 |
-
encoded_text = httpx.utils.quote(text)
|
33 |
url = api_url_template.format(text=encoded_text)
|
34 |
|
35 |
async with httpx.AsyncClient() as client:
|
36 |
response = await client.get(url, timeout=10.0)
|
37 |
|
38 |
-
# Assuming 200 OK means "safe" and other statuses might mean "unsafe" or error
|
39 |
-
# This logic might need adjustment based on the specific API's response codes
|
40 |
if response.status_code == 200:
|
41 |
-
# Further, check response content if API specifies (e.g., JSON payload)
|
42 |
-
# For a generic template, we'll assume 200 means safe.
|
43 |
-
# Example: response_json = response.json(); return response_json.get("is_safe", False)
|
44 |
return True
|
45 |
else:
|
46 |
print(f"NSFW Check: API request failed or content flagged. Status: {response.status_code}, Response: {response.text[:200]}")
|
47 |
return False
|
48 |
except httpx.RequestError as e:
|
49 |
print(f"NSFW Check: API request error: {e}")
|
50 |
-
return False
|
51 |
except Exception as e:
|
52 |
print(f"NSFW Check: An unexpected error occurred: {e}")
|
53 |
return False
|
@@ -59,36 +53,49 @@ async def synthesize_speech_line(
|
|
59 |
voice: str,
|
60 |
output_path: str,
|
61 |
model: str = "tts-1-hd",
|
|
|
|
|
62 |
nsfw_api_url_template: str | None = None,
|
63 |
-
line_index: int = -1
|
64 |
) -> str | None:
|
65 |
"""
|
66 |
Synthesizes a single line of text to speech using OpenAI TTS.
|
|
|
67 |
Retries on RateLimitError with exponential backoff.
|
68 |
Returns the output_path if successful, None otherwise.
|
69 |
"""
|
70 |
if nsfw_api_url_template:
|
71 |
if not await is_content_safe(text, nsfw_api_url_template):
|
72 |
print(f"Line {line_index if line_index != -1 else 'N/A'}: Content flagged as NSFW. Skipping synthesis.")
|
73 |
-
# Create a silent MP3 or handle as needed; here we skip and return None
|
74 |
-
# This could also raise an exception to halt processing.
|
75 |
return None
|
76 |
|
77 |
-
|
78 |
current_retry = 0
|
79 |
backoff_seconds = INITIAL_BACKOFF_SECONDS
|
80 |
|
81 |
-
async with semaphore:
|
82 |
while current_retry < MAX_RETRIES:
|
83 |
try:
|
84 |
-
|
85 |
-
model
|
86 |
-
voice
|
87 |
-
input
|
88 |
-
response_format
|
89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
await response.astream_to_file(output_path)
|
91 |
-
# print(f"Successfully synthesized line {line_index if line_index !=-1 else ''} to {output_path} using voice {voice}")
|
92 |
return output_path
|
93 |
except RateLimitError as e:
|
94 |
current_retry += 1
|
@@ -97,19 +104,16 @@ async def synthesize_speech_line(
|
|
97 |
return None
|
98 |
print(f"Line {line_index if line_index != -1 else ''}: Rate limit hit. Retrying in {backoff_seconds}s... (Attempt {current_retry}/{MAX_RETRIES})")
|
99 |
await asyncio.sleep(backoff_seconds)
|
100 |
-
backoff_seconds *= 2
|
101 |
except OpenAIError as e:
|
102 |
print(f"Line {line_index if line_index != -1 else ''}: OpenAI API error: {e}")
|
103 |
-
return None
|
104 |
except Exception as e:
|
105 |
print(f"Line {line_index if line_index != -1 else ''}: An unexpected error occurred during synthesis: {e}")
|
106 |
return None
|
107 |
-
return None
|
108 |
-
|
109 |
|
110 |
if __name__ == '__main__':
|
111 |
-
# This is a basic test and requires OPENAI_API_KEY to be set in environment
|
112 |
-
# and a temporary directory to exist or be created.
|
113 |
async def main_test():
|
114 |
api_key = os.getenv("OPENAI_API_KEY")
|
115 |
if not api_key:
|
@@ -119,22 +123,37 @@ if __name__ == '__main__':
|
|
119 |
client = AsyncOpenAI(api_key=api_key)
|
120 |
|
121 |
test_lines = [
|
122 |
-
{"id": 0, "speaker": "Alice", "text": "Hello, this is a test line for Alice."},
|
123 |
-
{"id": 1, "speaker": "Bob", "text": "And this is Bob, testing his voice."},
|
124 |
-
{"id": 2, "speaker": "Alice", "text": "A short reply."},
|
125 |
-
{"id": 3, "speaker": "Charlie", "text": "Charlie here,
|
126 |
]
|
127 |
|
128 |
-
temp_dir = "
|
129 |
os.makedirs(temp_dir, exist_ok=True)
|
130 |
|
131 |
tasks = []
|
132 |
for i, line_data in enumerate(test_lines):
|
133 |
-
|
134 |
-
|
135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
tasks.append(
|
137 |
-
synthesize_speech_line(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
)
|
139 |
|
140 |
results = await asyncio.gather(*tasks)
|
@@ -143,14 +162,7 @@ if __name__ == '__main__':
|
|
143 |
print(f"\nSuccessfully synthesized {len(successful_files)} out of {len(test_lines)} lines.")
|
144 |
for f_path in successful_files:
|
145 |
print(f" - {f_path}")
|
146 |
-
|
147 |
-
# Clean up test files (optional)
|
148 |
-
# for f_path in successful_files:
|
149 |
-
# os.remove(f_path)
|
150 |
-
# if not os.listdir(temp_dir): # only remove if empty
|
151 |
-
# os.rmdir(temp_dir)
|
152 |
-
|
153 |
|
154 |
-
if os.name == 'nt':
|
155 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
156 |
asyncio.run(main_test())
|
|
|
4 |
from openai import AsyncOpenAI, OpenAIError, RateLimitError
|
5 |
import httpx # For NSFW check
|
6 |
|
7 |
+
# Expanded list of voices based on recent OpenAI documentation
|
8 |
+
OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer', 'ash', 'ballad', 'coral', 'sage', 'verse']
|
9 |
|
10 |
# Concurrency limiter
|
11 |
MAX_CONCURRENT_REQUESTS = 2
|
|
|
21 |
Returns True if safe or if API URL is not provided, False if unsafe.
|
22 |
"""
|
23 |
if not api_url_template:
|
24 |
+
return True
|
25 |
|
|
|
26 |
if "{text}" not in api_url_template:
|
27 |
print("Warning: NSFW_API_URL_TEMPLATE does not contain {text} placeholder. Skipping NSFW check.")
|
28 |
return True
|
29 |
|
30 |
try:
|
31 |
+
encoded_text = httpx.utils.quote(text)
|
32 |
url = api_url_template.format(text=encoded_text)
|
33 |
|
34 |
async with httpx.AsyncClient() as client:
|
35 |
response = await client.get(url, timeout=10.0)
|
36 |
|
|
|
|
|
37 |
if response.status_code == 200:
|
|
|
|
|
|
|
38 |
return True
|
39 |
else:
|
40 |
print(f"NSFW Check: API request failed or content flagged. Status: {response.status_code}, Response: {response.text[:200]}")
|
41 |
return False
|
42 |
except httpx.RequestError as e:
|
43 |
print(f"NSFW Check: API request error: {e}")
|
44 |
+
return False
|
45 |
except Exception as e:
|
46 |
print(f"NSFW Check: An unexpected error occurred: {e}")
|
47 |
return False
|
|
|
53 |
voice: str,
|
54 |
output_path: str,
|
55 |
model: str = "tts-1-hd",
|
56 |
+
speed: float = 1.0,
|
57 |
+
instructions: str | None = None,
|
58 |
nsfw_api_url_template: str | None = None,
|
59 |
+
line_index: int = -1
|
60 |
) -> str | None:
|
61 |
"""
|
62 |
Synthesizes a single line of text to speech using OpenAI TTS.
|
63 |
+
Includes speed and instructions parameters based on model compatibility.
|
64 |
Retries on RateLimitError with exponential backoff.
|
65 |
Returns the output_path if successful, None otherwise.
|
66 |
"""
|
67 |
if nsfw_api_url_template:
|
68 |
if not await is_content_safe(text, nsfw_api_url_template):
|
69 |
print(f"Line {line_index if line_index != -1 else 'N/A'}: Content flagged as NSFW. Skipping synthesis.")
|
|
|
|
|
70 |
return None
|
71 |
|
|
|
72 |
current_retry = 0
|
73 |
backoff_seconds = INITIAL_BACKOFF_SECONDS
|
74 |
|
75 |
+
async with semaphore:
|
76 |
while current_retry < MAX_RETRIES:
|
77 |
try:
|
78 |
+
request_params = {
|
79 |
+
"model": model,
|
80 |
+
"voice": voice,
|
81 |
+
"input": text,
|
82 |
+
"response_format": "mp3"
|
83 |
+
}
|
84 |
+
|
85 |
+
# Add speed if model supports it and speed is not default
|
86 |
+
if model in ["tts-1", "tts-1-hd"]:
|
87 |
+
if speed is not None and speed != 1.0: # OpenAI default is 1.0
|
88 |
+
# Ensure speed is within valid range for safety, though UI should also constrain this
|
89 |
+
clamped_speed = max(0.25, min(speed, 4.0))
|
90 |
+
request_params["speed"] = clamped_speed
|
91 |
+
|
92 |
+
# Add instructions if model supports it and instructions are provided
|
93 |
+
# Assuming gpt-4o-mini-tts supports it, and tts-1/tts-1-hd do not.
|
94 |
+
if model not in ["tts-1", "tts-1-hd"] and instructions: # Example: gpt-4o-mini-tts
|
95 |
+
request_params["instructions"] = instructions
|
96 |
+
|
97 |
+
response = await client.audio.speech.create(**request_params)
|
98 |
await response.astream_to_file(output_path)
|
|
|
99 |
return output_path
|
100 |
except RateLimitError as e:
|
101 |
current_retry += 1
|
|
|
104 |
return None
|
105 |
print(f"Line {line_index if line_index != -1 else ''}: Rate limit hit. Retrying in {backoff_seconds}s... (Attempt {current_retry}/{MAX_RETRIES})")
|
106 |
await asyncio.sleep(backoff_seconds)
|
107 |
+
backoff_seconds *= 2
|
108 |
except OpenAIError as e:
|
109 |
print(f"Line {line_index if line_index != -1 else ''}: OpenAI API error: {e}")
|
110 |
+
return None
|
111 |
except Exception as e:
|
112 |
print(f"Line {line_index if line_index != -1 else ''}: An unexpected error occurred during synthesis: {e}")
|
113 |
return None
|
114 |
+
return None
|
|
|
115 |
|
116 |
if __name__ == '__main__':
|
|
|
|
|
117 |
async def main_test():
|
118 |
api_key = os.getenv("OPENAI_API_KEY")
|
119 |
if not api_key:
|
|
|
123 |
client = AsyncOpenAI(api_key=api_key)
|
124 |
|
125 |
test_lines = [
|
126 |
+
{"id": 0, "speaker": "Alice", "text": "Hello, this is a test line for Alice, spoken quickly."},
|
127 |
+
{"id": 1, "speaker": "Bob", "text": "And this is Bob, testing his voice with instructions.", "instructions": "Speak in a deep, resonant voice."},
|
128 |
+
{"id": 2, "speaker": "Alice", "text": "A short reply, spoken slowly.", "speed": 0.8},
|
129 |
+
{"id": 3, "speaker": "Charlie", "text": "Charlie here, normal speed."}
|
130 |
]
|
131 |
|
132 |
+
temp_dir = "test_audio_output_enhanced"
|
133 |
os.makedirs(temp_dir, exist_ok=True)
|
134 |
|
135 |
tasks = []
|
136 |
for i, line_data in enumerate(test_lines):
|
137 |
+
# Test with specific models to check param compatibility
|
138 |
+
# For Alice (speed): tts-1-hd. For Bob (instructions): gpt-4o-mini-tts
|
139 |
+
current_model = "tts-1-hd"
|
140 |
+
if "instructions" in line_data:
|
141 |
+
current_model = "gpt-4o-mini-tts" # Example, ensure this model is available for your key
|
142 |
+
|
143 |
+
voice = OPENAI_VOICES[i % len(OPENAI_VOICES)]
|
144 |
+
output_file = os.path.join(temp_dir, f"line_{line_data['id']}_{current_model}.mp3")
|
145 |
+
|
146 |
tasks.append(
|
147 |
+
synthesize_speech_line(
|
148 |
+
client,
|
149 |
+
line_data["text"],
|
150 |
+
voice,
|
151 |
+
output_file,
|
152 |
+
model=current_model,
|
153 |
+
speed=line_data.get("speed", 1.0),
|
154 |
+
instructions=line_data.get("instructions"),
|
155 |
+
line_index=line_data['id']
|
156 |
+
)
|
157 |
)
|
158 |
|
159 |
results = await asyncio.gather(*tasks)
|
|
|
162 |
print(f"\nSuccessfully synthesized {len(successful_files)} out of {len(test_lines)} lines.")
|
163 |
for f_path in successful_files:
|
164 |
print(f" - {f_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
|
166 |
+
if os.name == 'nt':
|
167 |
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
168 |
asyncio.run(main_test())
|