# Spaces: abocha/esl-dialogue-tts (Running)
# File size: 3,156 Bytes
# 1190db4

import re
import math

# Upper bound on the raw script size, in characters.
MAX_SCRIPT_LENGTH = 10_000

# tts-1-hd pricing, per character: $30 / 1M chars.
TTS_1_HD_COST_PER_CHAR = 0.00003

# gpt-4o-mini-tts pricing, per second of audio: $0.015 / minute.
GPT_4O_MINI_TTS_COST_PER_SECOND = 0.015 / 60

# Rough estimate of characters spoken per second, used to guess TTS duration.
CHARS_PER_SECOND_ESTIMATE = 10

def parse_dialogue_script(script_text):
    """
    Parses a dialogue script into a list of utterance records.

    Input format: "[Speaker] Utterance" per line. A line that does not start
    with a "[...]" tag, or whose tag is empty (e.g. "[]"), is attributed to
    the speaker "Narrator". Blank lines and lines with an empty utterance
    are skipped.

    Args:
        script_text: The raw script text.

    Returns:
        A ``(parsed_lines, total_chars)`` tuple. ``parsed_lines`` is a list of
        dicts with keys ``"id"``, ``"speaker"`` and ``"text"``; ``"id"`` is the
        zero-based index of the line within the stripped script, so ids are
        not contiguous when lines are skipped. ``total_chars`` is the summed
        length of all kept utterances (speaker tags are not counted).

    Raises:
        ValueError: If ``script_text`` is longer than MAX_SCRIPT_LENGTH
            characters (measured on the raw, unstripped text).
    """
    # Enforce the size cap first so oversized input is rejected before any
    # stripping/splitting work is done on it.
    if len(script_text) > MAX_SCRIPT_LENGTH:
        raise ValueError(f"Script is too long. Maximum {MAX_SCRIPT_LENGTH} characters allowed. Your script has {len(script_text)} characters.")

    # Compiled once per call instead of being looked up for every line.
    speaker_tag = re.compile(r'\[(.*?)\]\s*(.*)')
    default_speaker = "Narrator"

    parsed_lines = []
    total_chars = 0

    for i, raw_line in enumerate(script_text.strip().split('\n')):
        line_content = raw_line.strip()
        if not line_content:
            continue

        match = speaker_tag.match(line_content)
        if match:
            speaker, utterance = (part.strip() for part in match.groups())
        else:
            # No leading speaker tag: the whole line is the utterance.
            speaker, utterance = "", line_content

        if not utterance:  # e.g. a bare "[Bob]" with nothing to say
            continue

        # An empty/whitespace-only tag carries no speaker name, so treat it
        # like an untagged line rather than emitting a blank speaker.
        parsed_lines.append({"id": i, "speaker": speaker or default_speaker, "text": utterance})
        total_chars += len(utterance)

    return parsed_lines, total_chars

def calculate_cost(total_chars, num_lines, model_name="tts-1-hd"):
    """
    Estimates the dollar cost of synthesizing ``total_chars`` characters.

    Args:
        total_chars: Number of characters to be synthesized.
        num_lines: Accepted for interface compatibility; not used in the
            calculation.
        model_name: Either "tts-1-hd" (billed per character) or
            "gpt-4o-mini-tts" (billed per estimated second of audio).

    Returns:
        The estimated cost as a number.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    if model_name == "tts-1-hd":
        return total_chars * TTS_1_HD_COST_PER_CHAR

    if model_name == "gpt-4o-mini-tts":
        # Duration is only a rough guess derived from a fixed
        # characters-per-second rate; the real audio length may differ.
        duration_seconds = total_chars / CHARS_PER_SECOND_ESTIMATE
        return duration_seconds * GPT_4O_MINI_TTS_COST_PER_SECOND

    raise ValueError(f"Unknown model for cost calculation: {model_name}")

if __name__ == '__main__':
    # Manual smoke test: parse a small sample, print the per-model cost
    # estimates, then confirm that an oversized script is rejected.
    demo_script = """
    [Alice] Hello Bob, how are you?
    [Bob] I'm fine, Alice. And you?
    This is a line without a speaker tag.
    [Charlie] Just listening in.
    """
    entries, chars = parse_dialogue_script(demo_script)
    line_count = len(entries)

    print("Parsed Lines:")
    for entry in entries:
        print(entry)
    print(f"\nTotal Characters: {chars}")

    # Per-character pricing model.
    cost_hd = calculate_cost(chars, line_count, "tts-1-hd")
    print(f"Estimated cost for tts-1-hd: ${cost_hd:.6f}")

    # Per-second pricing model (duration is estimated from the character count).
    cost_gpt_mini = calculate_cost(chars, line_count, "gpt-4o-mini-tts")
    print(f"Estimated cost for gpt-4o-mini-tts: ${cost_gpt_mini:.6f}")

    # One character over the limit must trigger the length check.
    oversized_script = "a" * (MAX_SCRIPT_LENGTH + 1)
    try:
        parse_dialogue_script(oversized_script)
    except ValueError as e:
        print(f"Error for long script: {e}")