import io
import os
import queue
import threading

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
from huggingface_hub import InferenceClient
from kokoro import KModel, KPipeline

# -----------------------------------------------------------------------------
# Get podcast subject
# -----------------------------------------------------------------------------
from papers import PaperManager

paper_manager = PaperManager()
top_papers = paper_manager.get_top_content()

PODCAST_SUBJECT = list(top_papers.values())[0]

# -----------------------------------------------------------------------------
# LLM that writes the script
# -----------------------------------------------------------------------------
client = InferenceClient(
    "meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=os.getenv("HF_TOKEN"),
)


def generate_podcast_text(subject: str) -> str:
    """Ask the LLM for the script of a podcast hosted by two speakers."""
    prompt = f"""Generate the script of "Open Paper Review", a podcast in which two hosts discuss the topic below: it's the top trending paper on Hugging Face daily papers today. Analyze it and bring profound insights.

{subject}

The podcast should be an insightful discussion with some playful banter.
Format the dialog as follows, one utterance per line, each line prefaced by the speaker tag: [S1] for the male host and [S2] for the female host. For instance:
[S1] Hello, how are you?
[S2] I'm good, thank you. How are you?
[S1] I'm good, thank you.
[S2] Great.

The podcast should last around 5 minutes.
"""
    response = client.chat_completion(
        [{"role": "user", "content": prompt}],  # send the full prompt; truncating it would drop the subject and formatting instructions
        max_tokens=8156,
    )
    return response.choices[0].message.content


# -----------------------------------------------------------------------------
# Kokoro TTS
# -----------------------------------------------------------------------------
CUDA_AVAILABLE = torch.cuda.is_available()

kmodel = KModel().to("cuda" if CUDA_AVAILABLE else "cpu").eval()
kpipeline = KPipeline(lang_code="a")  # "a" = American English voices

MALE_VOICE = "am_michael"  # [S1]
FEMALE_VOICE = "af_heart"  # [S2]

# Pre-warm the voices to avoid first-call latency
for v in (MALE_VOICE, FEMALE_VOICE):
    kpipeline.load_voice(v)

# -----------------------------------------------------------------------------
# Audio generation system with queue
# -----------------------------------------------------------------------------
audio_queue: queue.Queue[tuple[int, np.ndarray] | None] = queue.Queue()
stop_signal = threading.Event()


@spaces.GPU
def process_audio_chunks(podcast_text: str, speed: float = 1.0) -> None:
    """Read each script line, pick the voice from its speaker tag, and push audio chunks to the queue."""
    lines = [line for line in podcast_text.strip().splitlines() if line.strip()]

    pipeline = kpipeline
    pipeline_voice_female = pipeline.load_voice(FEMALE_VOICE)
    pipeline_voice_male = pipeline.load_voice(MALE_VOICE)

    for line in lines:
        if stop_signal.is_set():
            break

        # Expect "[S1] ..." or "[S2] ..."
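        # Route the line to a speaker: the tag picks both the voice name passed to
        # the pipeline and the voice pack used to look up the reference style below.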
        if line.startswith("[S1]"):
            pipeline_voice = pipeline_voice_male
            voice = MALE_VOICE
            utterance = line[len("[S1]"):].strip()
        elif line.startswith("[S2]"):
            pipeline_voice = pipeline_voice_female
            voice = FEMALE_VOICE
            utterance = line[len("[S2]"):].strip()
        else:  # fallback: untagged lines default to the female voice
            pipeline_voice = pipeline_voice_female
            voice = FEMALE_VOICE
            utterance = line

        first = True
        for _, ps, _ in pipeline(utterance, voice, speed):
            ref_s = pipeline_voice[len(ps) - 1]  # reference style indexed by phoneme length
            audio = kmodel(ps, ref_s, speed)
            audio_queue.put((24000, audio.numpy()))  # Kokoro outputs 24 kHz audio
            if first:
                first = False
                # Push a tiny silent chunk after the first one to prime the output stream.
                audio_queue.put((24000, torch.zeros(1).numpy()))

    audio_queue.put(None)  # Sentinel: signal end of stream


def stream_audio_generator(podcast_text: str):
    """Start the TTS worker and stream WAV-encoded chunks as they arrive."""
    stop_signal.clear()

    # Drain chunks left over from a previous (stopped) run so streams don't mix.
    while not audio_queue.empty():
        audio_queue.get_nowait()

    threading.Thread(target=process_audio_chunks, args=(podcast_text,), daemon=True).start()

    while True:
        chunk = audio_queue.get()
        if chunk is None:
            break
        sr, data = chunk
        buf = io.BytesIO()
        sf.write(buf, data, sr, format="wav")
        buf.seek(0)
        yield buf.getvalue(), "Generating podcast..."


def stop_generation():
    stop_signal.set()
    return "Generation stopped"


def generate_podcast():
    return generate_podcast_text(PODCAST_SUBJECT)


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# NotebookLM Podcast Generator")

    with gr.Row():
        with gr.Column(scale=2):
            # gr.Markdown(f"## Current Topic: {PODCAST_SUBJECT}")
            gr.Markdown(
                "This app generates a podcast discussion between two hosts about today's top trending paper on Hugging Face daily papers."
            )

            generate_btn = gr.Button("Generate Podcast Script", variant="primary")
            podcast_output = gr.Textbox(label="Generated Podcast Script", lines=15)

            gr.Markdown("## Audio Preview")
            gr.Markdown("Click below to hear the podcast with realistic voices:")

            with gr.Row():
                start_audio_btn = gr.Button("▶️ Generate Podcast", variant="secondary")
                stop_btn = gr.Button("⏹️ Stop", variant="stop")

            audio_output = gr.Audio(label="Podcast Audio", streaming=True)
            status_text = gr.Textbox(label="Status", visible=True)

    generate_btn.click(fn=generate_podcast, outputs=podcast_output)
    start_audio_btn.click(
        fn=stream_audio_generator,
        inputs=podcast_output,
        outputs=[audio_output, status_text],
    )
    stop_btn.click(fn=stop_generation, outputs=status_text)

if __name__ == "__main__":
    demo.queue().launch()
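# -----------------------------------------------------------------------------
# Headless usage (a minimal sketch, not part of the app). It assumes the
# module-level objects above; the function name and output path are made up for
# illustration. Run it in place of the UI to render the whole podcast to disk:
#
#   def render_podcast_to_wav(path: str = "podcast.wav") -> None:
#       script = generate_podcast_text(PODCAST_SUBJECT)
#       stop_signal.clear()
#       threading.Thread(target=process_audio_chunks, args=(script,), daemon=True).start()
#       chunks = []
#       while (chunk := audio_queue.get()) is not None:  # None is the end-of-stream sentinel
#           chunks.append(chunk[1])
#       sf.write(path, np.concatenate(chunks), 24000)
# -----------------------------------------------------------------------------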