Spaces:

abocha
/

esl-dialogue-tts

Running

File size: 5,328 Bytes

import re
import math

# Upper bound, in characters, on the raw script text accepted by the parser.
MAX_SCRIPT_LENGTH = 10_000
# Per-character rate used for both tts-1 and tts-1-hd: $30 per 1M characters.
TTS_1_HD_COST_PER_CHAR = 30 / 1_000_000
# Per-second rate for gpt-4o-mini-tts, derived from $0.015 per minute.
GPT_4O_MINI_TTS_COST_PER_SECOND = 0.015 / 60
# Assumed average number of characters spoken per second (estimation only).
CHARS_PER_SECOND_ESTIMATE = 12

def parse_dialogue_script(script_text):
    """
    Turn a dialogue script into a list of structured line entries.

    Every non-blank line is expected to look like "[Speaker] Utterance".
    A line that does not start with a "[...]" tag is attributed to "Narrator";
    an empty tag ("[] text") is attributed to "UnknownSpeaker". Lines whose
    utterance is empty after trimming (e.g. "[Speaker]" alone) are dropped.

    Returns a tuple (entries, char_count). Each entry is a dict with keys
    "id" (0-based index of the line within the whitespace-stripped script,
    blank lines included in the numbering), "speaker" and "text". char_count
    is the combined length of all kept utterances.

    Raises ValueError if the raw script exceeds MAX_SCRIPT_LENGTH characters.
    """
    raw_lines = script_text.strip().split('\n')

    # The limit applies to the untrimmed input, not to the parsed utterances.
    script_length = len(script_text)
    if script_length > MAX_SCRIPT_LENGTH:
        raise ValueError(f"Script is too long. Maximum {MAX_SCRIPT_LENGTH} characters allowed. Your script has {script_length} characters.")

    tagged_line = re.compile(r'\[(.*?)\]\s*(.*)')
    entries = []
    for position, raw in enumerate(raw_lines):
        stripped = raw.strip()
        if not stripped:
            continue  # blank line

        tag_match = tagged_line.match(stripped)
        if tag_match is None:
            # No leading speaker tag: the whole line is narration.
            speaker, text = "Narrator", stripped
        else:
            speaker = tag_match.group(1).strip() or "UnknownSpeaker"
            text = tag_match.group(2).strip()

        if text:
            entries.append({"id": position, "speaker": speaker, "text": text})

    char_count = sum(len(entry["text"]) for entry in entries)
    return entries, char_count

def calculate_cost(total_chars, num_lines, model_name="tts-1-hd"):
    """
    Estimate the cost, in US dollars, of synthesising total_chars characters.

    "gpt-4o-mini-tts" is priced per second of audio, with the duration
    approximated as total_chars / CHARS_PER_SECOND_ESTIMATE. Every other
    model name, including "tts-1", "tts-1-hd" and unrecognised names, is
    priced per character at TTS_1_HD_COST_PER_CHAR.

    num_lines is accepted for interface compatibility but does not affect
    the result.

    NOTE(review): the per-second pricing for gpt-4o-mini-tts follows the
    original requirement; the original author noted the provider may price
    this model per character instead. Confirm against current pricing.
    NOTE(review): tts-1 and tts-1-hd share a single rate here; confirm that
    this still matches the provider's price list.
    """
    if model_name != "gpt-4o-mini-tts":
        # Character-based pricing, also the fallback for unknown models.
        return total_chars * TTS_1_HD_COST_PER_CHAR

    # Guard against a non-positive rate constant by substituting 10 chars/sec.
    if CHARS_PER_SECOND_ESTIMATE > 0:
        chars_per_second = CHARS_PER_SECOND_ESTIMATE
    else:
        chars_per_second = 10.0
    estimated_seconds = total_chars / chars_per_second
    return estimated_seconds * GPT_4O_MINI_TTS_COST_PER_SECOND

if __name__ == '__main__':
    # Ad-hoc smoke run: results are printed for manual inspection, not asserted.
    mixed_script = """
    [Alice] Hello Bob, how are you?
    [Bob] I'm fine, Alice. And you?
    This is a line without a speaker tag.
    [Charlie] Just listening in.
    [] This line has an empty speaker tag.
    [EmptySpeakerText]
    """
    print("--- Test Case 1: Mixed Script ---")
    demo_lines, demo_chars = parse_dialogue_script(mixed_script)
    print("Parsed Lines:")
    for entry in demo_lines:
        print(entry)
    print(f"\nTotal Characters for TTS: {demo_chars}")

    line_count = len(demo_lines)

    # Both character-priced models are reported with the same message format.
    for char_priced_model in ("tts-1-hd", "tts-1"):
        estimate = calculate_cost(demo_chars, line_count, char_priced_model)
        print(f"Estimated cost for {char_priced_model}: ${estimate:.6f}")

    # gpt-4o-mini-tts goes through the per-second formula.
    mini_estimate = calculate_cost(demo_chars, line_count, "gpt-4o-mini-tts")
    print(f"Estimated cost for gpt-4o-mini-tts (at {CHARS_PER_SECOND_ESTIMATE} chars/sec): ${mini_estimate:.6f}")

    print("\n--- Test Case 2: Long Script (Boundary Check) ---")
    # Build a script whose total length is exactly MAX_SCRIPT_LENGTH.
    speaker_prefix = "[SpeakerA] "
    boundary_script = speaker_prefix + "a" * (MAX_SCRIPT_LENGTH - len(speaker_prefix))
    boundary_lines, boundary_chars = parse_dialogue_script(boundary_script)
    print(f"Long script (length {len(boundary_script)} chars) parsed successfully. TTS Chars: {boundary_chars}")

    # One character over the limit must be rejected with ValueError.
    oversized_script = "a" * (MAX_SCRIPT_LENGTH + 1)
    try:
        parse_dialogue_script(oversized_script)
    except ValueError as err:
        print(f"Correctly caught error for too long script: {err}")

    print("\n--- Test Case 3: Empty and Invalid Scripts ---")
    empty_lines, empty_chars = parse_dialogue_script("")
    print(f"Empty script: Parsed lines: {len(empty_lines)}, Chars: {empty_chars}")
    blank_lines, blank_chars = parse_dialogue_script("\n\n[Speaker]\n\n")
    print(f"Script with blank/invalid lines: Parsed lines: {len(blank_lines)}, Chars: {blank_chars} (Result: {blank_lines})")