"""func.py ── utilities for Hugging Face Space.

Three-step pipeline:

1. ``img2text``    – caption an image.
2. ``text2story``  – turn the caption into a short children's story.
3. ``story2audio`` – synthesize the story to a WAV file.

Every model is loaded lazily on first use and cached in a module-level
variable so repeated calls do not reload weights.
"""

import re
import textwrap
from pathlib import Path
from typing import Union

import numpy as np
import soundfile as sf
import torch
from PIL import Image
from transformers import pipeline

# -------------------------------------------------------------------
# Step 1. Image ➜ Text
# -------------------------------------------------------------------

# lazy-load caption model once
_captioner = None


def _get_captioner():
    """Return the shared image-captioning pipeline, creating it on first use."""
    global _captioner
    if _captioner is None:
        _captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-large",
        )
    return _captioner


def img2text(img: Union[Image.Image, str, Path]) -> str:
    """
    Return a short English caption for an image.

    Args:
        img: PIL.Image, local path, or pathlib.Path.

    Returns:
        Caption string.
    """
    captioner = _get_captioner()

    # Already a PIL image: the caller owns its lifetime.
    if isinstance(img, Image.Image):
        return captioner(img)[0]["generated_text"]

    # Path input: open it ourselves and run the captioner while the file is
    # still open, so the handle is released as soon as we are done.
    with Image.open(img) as opened:
        return captioner(opened)[0]["generated_text"]


# -------------------------------------------------------------------
# Step 2. Caption ➜ Children’s story (DeepSeek-R1 1.5 B)
# -------------------------------------------------------------------

_GEN_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

_PROMPT_TMPL = (
    "Write a funny and warm children's story (50-100 words) for ages 3-10, "
    "fully and strictly based on this scene: {caption}\nStory:"
)

# Hard upper bound on the number of words in a returned story.
_MAX_STORY_WORDS = 100

# Split after sentence-ending punctuation followed by whitespace.
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')

_generator = None


def _get_generator():
    """Lazy-load DeepSeek generator once (GPU if available)."""
    global _generator
    if _generator is None:
        _generator = pipeline(
            "text-generation",
            model=_GEN_MODEL,
            device=0 if torch.cuda.is_available() else -1,
            max_new_tokens=150,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            no_repeat_ngram_size=4,   # block 4-gram repeats
            repetition_penalty=1.15,  # soften copy-loops
        )
    return _generator


def _dedup_sentences(text: str) -> str:
    """Remove exact duplicate sentences while preserving order."""
    seen, cleaned = set(), []
    for sent in _SENTENCE_SPLIT_RE.split(text.strip()):
        s = sent.strip()
        if s and s not in seen:
            cleaned.append(s)
            seen.add(s)
    return " ".join(cleaned)


def _finalize_story(story: str, max_words: int = _MAX_STORY_WORDS) -> str:
    """Cap *story* at *max_words* words, then make sure it ends a sentence.

    The cap is applied first: truncation can cut a sentence in half, and
    the closing punctuation has to be added to whatever text survives it.
    """
    capped = " ".join(story.split()[:max_words])
    # Drop a dangling separator so the result does not end in ",." or ":.".
    capped = capped.rstrip(",;:")
    if capped and capped[-1] not in ".!?":
        capped += "."
    return capped


def text2story(caption: str) -> str:
    """
    Generate a ≤100-word children’s story from the image caption.

    Args:
        caption: scene description string.

    Returns:
        Story text (plain string, ≤100 words, no exact duplicate sentences,
        ending in sentence punctuation whenever it is non-empty).
    """
    prompt = _PROMPT_TMPL.format(caption=caption)
    raw = _get_generator()(prompt, return_full_text=False)[0]["generated_text"]
    return _finalize_story(_dedup_sentences(raw))


# -------------------------------------------------------------------
# Step 3. Text ➜ Audio
# -------------------------------------------------------------------

_TTS_MODEL = "facebook/mms-tts-eng"

_tts = None


def _get_tts():
    """Lazy-load the TTS pipeline once."""
    global _tts
    if _tts is None:
        _tts = pipeline("text-to-speech", model=_TTS_MODEL)
    return _tts


def story2audio(story: str, wav_path: str = "story.wav") -> str:
    """
    Synthesize speech for a story and save as WAV.

    Args:
        story: Text returned by `text2story(...)`.
        wav_path: Output file name.

    Returns:
        Path to the saved WAV file.

    Raises:
        ValueError: If `story` is empty or contains only whitespace.
    """
    # long text → stable chunks
    chunks = textwrap.wrap(story, width=200)
    if not chunks:
        # np.concatenate([]) would raise an opaque ValueError; fail early
        # with a clear message and without loading the TTS model.
        raise ValueError("story is empty; nothing to synthesize")

    tts = _get_tts()
    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])
    sf.write(wav_path, audio, tts.model.config.sampling_rate)
    return wav_path