import spaces  # must come first

import gradio as gr
import numpy as np
import torch

from generator import load_csm_1b, Segment

# Model output sample rate (Hz) — CSM-1B emits 24 kHz mono float audio.
SAMPLE_RATE = 24000
# How many previous segments are fed back as conversational context.
CONTEXT_WINDOW = 3

generator = load_csm_1b(device="cuda")


def make_silence(duration_ms=300):
    """Return a silent float32 clip of *duration_ms* at SAMPLE_RATE."""
    num_samples = int((SAMPLE_RATE * duration_ms) / 1000)
    return np.zeros(num_samples, dtype=np.float32)


def fade(audio_np, fade_duration_ms=200):
    """Apply a linear fade-in and fade-out to *audio_np* in place.

    Returns the same array for convenience.

    BUGFIX: the original implementation crashed with a NumPy broadcast
    error in two reachable cases:
      * fade_duration_ms == 0 (the UI slider's minimum) — ``audio_np[-0:]``
        selects the WHOLE array, which was then multiplied by an empty ramp;
      * clips shorter than the fade window — the slice was shorter than
        the ramp it was multiplied by.
    The fade length is now clamped so the two ramps never overlap, and a
    zero-length fade is a no-op.
    """
    fade_len = int((fade_duration_ms / 1000) * SAMPLE_RATE)
    # Clamp so fade-in and fade-out stay disjoint even on very short clips.
    fade_len = min(fade_len, len(audio_np) // 2)
    if fade_len <= 0:
        return audio_np
    audio_np[:fade_len] *= np.linspace(0, 1, fade_len)
    audio_np[-fade_len:] *= np.linspace(1, 0, fade_len)
    return audio_np


@spaces.GPU(duration=30)
def infer(input_text, temp, top_k, top_p, fade_ms, pause_ms):
    """Synthesize each non-empty input line as one segment and stitch them.

    A line consisting of ``__PAUSE__`` inserts *pause_ms* of silence.
    The last CONTEXT_WINDOW generated segments are passed back to the
    model as conversational context.

    Returns a ``(sample_rate, int16_ndarray)`` tuple for ``gr.Audio``.
    Raises ``gr.Error`` when the input contains no usable lines.
    """
    lines = [line.strip() for line in input_text.strip().split("\n") if line.strip()]
    all_audio = []
    context = []
    for line in lines:
        if line == "__PAUSE__":
            all_audio.append(make_silence(pause_ms))
            continue
        trimmed_context = context[-CONTEXT_WINDOW:]
        audio = generator.generate(
            text=line,
            speaker=0,
            context=trimmed_context,
            max_audio_length_ms=8000,
            temperature=temp,
            topk=top_k,
            top_p=top_p,
        )
        audio_np = audio.cpu().numpy()
        audio_np = fade(audio_np, fade_duration_ms=fade_ms)
        all_audio.append(audio_np)
        context.append(Segment(text=line, speaker=0, audio=audio))
    if not all_audio:
        raise gr.Error("Please enter some text.")
    full_audio = np.concatenate(all_audio)
    # BUGFIX: clip before scaling — a sample at exactly 1.0 previously
    # became 1.0 * 32768 = 32768, which wraps to -32768 in int16 (a click).
    audio_int16 = (np.clip(full_audio, -1.0, 1.0) * 32767).astype(np.int16)
    return SAMPLE_RATE, audio_int16


# Multi-line default: __PAUSE__ must sit on its own line for infer()'s
# line-by-line matching to treat it as a pause marker.
DEFAULT_TEXT = """I hunger...
__PAUSE__
For love!"""

with gr.Blocks() as app:
    gr.Markdown("""
    # 🐸 Tahm Kench Voice Synth

    ## Usage tips
    - You may need to try the generation more than once to get a good result. Like most generative AI, there is an element of randomness, controlled by the sampling parameters.
    - ⚠️ **Apostrophe warning:** The model may get confused by apostrophes (`'`). Avoid them if possible.
    - The in-browser play feature does not work on Safari, but you can download and play the clips locally.
    - Enter one statement per line and the conversational segments will be stitched together.
    - The model is prone to make more mistakes and hallucinate with longer generations. If this is an issue, try shorter segments.
    - Use `__PAUSE__` on a line to insert a silent pause.
    - We apply aggressive fade-in and fade-out to each conversational segment to suppress audio artifacts at the edges. If your output is getting unnaturally clipped or quiet at the start/end, try adjusting the fade duration below.
    """)
    with gr.Row():
        input_text = gr.TextArea(lines=10, label="Input (multi-line)", value=DEFAULT_TEXT)
    with gr.Row():
        temp = gr.Slider(0.1, 1.5, value=0.3, step=0.05, label="Temperature")
        top_k = gr.Slider(1, 100, value=10, step=1, label="Top-K")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
    with gr.Row():
        fade_ms = gr.Slider(0, 1000, value=200, step=10, label="Fade Duration (ms)")
        pause_ms = gr.Slider(0, 1000, value=300, step=10, label="Pause Duration (ms)")
    output_audio = gr.Audio(label="Generated Audio", type="numpy")
    run_button = gr.Button("🎤 Synthesize")
    run_button.click(
        infer,
        inputs=[input_text, temp, top_k, top_p, fade_ms, pause_ms],
        outputs=[output_audio],
    )

app.launch()