# func.py  ── utilities for Hugging Face Space

# Step 1. Image to Text
from typing import Union
from pathlib import Path
from PIL import Image
from transformers import pipeline

# lazy-load caption model once
_captioner = None


def _get_captioner():
    """Return the shared image-captioning pipeline, building it on first use."""
    global _captioner
    if _captioner is not None:
        # Already built by an earlier call; reuse it.
        return _captioner

    _captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
    )
    return _captioner

def img2text(img: Union[Image.Image, str, Path]) -> str:
    """
    Return a short English caption for an image.

    Args:
        img: PIL.Image, local path, or pathlib.Path.

    Returns:
        Caption string.
    """
    if isinstance(img, Image.Image):
        image = img
    else:
        # Open the file inside a context manager so its handle is released
        # deterministically; convert() returns a new in-memory image, so it
        # stays usable after the file is closed.
        with Image.open(img) as opened:
            image = opened.convert("RGB")

    results = _get_captioner()(image)
    return results[0]["generated_text"]

# -------------------------------------------------------------------
# Step 2.  Caption ➜ Children’s story   (DeepSeek-R1 1.5 B)
# -------------------------------------------------------------------
import torch, re
from transformers import pipeline

# Hugging Face model id of the story generator.
_GEN_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Prompt template; `{caption}` is replaced with the image caption.
_PROMPT_TMPL = (
    "Write a funny and warm children's story (50-100 words) for ages 3-10, "
    "fully and strictly based on this scene: {caption}\n"
    "Story:"
)

_generator = None


def _get_generator():
    """Return the shared text-generation pipeline, creating it on first call.

    The pipeline is given device 0 when CUDA is available, otherwise -1.
    """
    global _generator
    if _generator is not None:
        return _generator

    device = 0 if torch.cuda.is_available() else -1
    sampling_opts = dict(
        max_new_tokens=150,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        no_repeat_ngram_size=4,    # block 4-gram repeats
        repetition_penalty=1.15,   # soften copy-loops
    )
    _generator = pipeline(
        "text-generation",
        model=_GEN_MODEL,
        device=device,
        **sampling_opts,
    )
    return _generator


def _dedup_sentences(text: str) -> str:
    """Drop repeated sentences, keeping the first occurrence of each in order.

    Sentences are split on whitespace that follows '.', '!' or '?'.
    Comparison is exact (case- and punctuation-sensitive) after stripping
    surrounding whitespace. The survivors are joined with single spaces.
    """
    pieces = (part.strip() for part in re.split(r'(?<=[.!?])\s+', text.strip()))
    # dict keys keep insertion order and collapse repeats -> ordered de-dup.
    unique = dict.fromkeys(piece for piece in pieces if piece)
    return " ".join(unique)


def text2story(caption: str) -> str:
    """
    Generate a ≤100-word children’s story from the image caption.

    Args:
        caption: scene description string.

    Returns:
        Story text (plain string, ≤100 words, no exact duplicate sentences).
        A non-empty story always ends with '.', '!' or '?', including when
        the 100-word cap cut it short.
    """
    prompt = _PROMPT_TMPL.format(caption=caption)
    raw = _get_generator()(prompt, return_full_text=False)[0]["generated_text"]

    # R1-style models may wrap their reasoning in <think>…</think>; drop any
    # such closed block so it is not read out as part of the story.
    raw = re.sub(r"<think>.*?</think>", " ", raw, flags=re.DOTALL)

    # Apply the hard 100-word cap BEFORE fixing the ending: truncating after
    # the punctuation check could leave the story ending mid-sentence with
    # no terminator.
    words = _dedup_sentences(raw).split()[:100]
    story = " ".join(words)

    # ensure ending punctuation (replace a dangling ',', ';' or ':' if any)
    if story and story[-1] not in ".!?":
        story = story.rstrip(",;:") + "."

    return story

# Step 3. Text to Audio
import numpy as np
import textwrap
import soundfile as sf
from transformers import pipeline

# Hugging Face model id of the speech synthesizer.
_TTS_MODEL = "facebook/mms-tts-eng"
_tts = None


def _get_tts():
    """Return the shared text-to-speech pipeline, creating it on first call."""
    global _tts
    if _tts is not None:
        return _tts

    _tts = pipeline("text-to-speech", model=_TTS_MODEL)
    return _tts


def story2audio(story: str, wav_path: str = "story.wav") -> str:
    """
    Synthesize speech for a story and save as WAV.

    Args:
        story: Text returned by `text2story(...)`.
        wav_path: Output file name.

    Returns:
        Path to the saved WAV file.

    Raises:
        ValueError: If `story` is empty or contains only whitespace.
    """
    # long text → stable chunks
    chunks = textwrap.wrap(story, width=200)
    if not chunks:
        # Fail early with a clear message (and before loading the model)
        # instead of letting np.concatenate choke on an empty list.
        raise ValueError("story2audio: `story` is empty; nothing to synthesize")

    tts = _get_tts()
    outputs = [tts(chunk) for chunk in chunks]
    audio = np.concatenate(
        [np.asarray(out["audio"]).squeeze() for out in outputs]
    )
    # Use the sampling rate reported alongside the audio by the pipeline
    # rather than reaching into a model-specific config attribute.
    sf.write(wav_path, audio, outputs[0]["sampling_rate"])
    return wav_path