# func.py ── utilities for Hugging Face Space

# -------------------------------------------------------------------
# Step 1. Image ➜ Text (BLIP captioning)
# -------------------------------------------------------------------
from typing import Union
from pathlib import Path

from PIL import Image
from transformers import pipeline

# lazy-load caption model once
_captioner = None


def _get_captioner():
    global _captioner
    if _captioner is None:
        _captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-large",
        )
    return _captioner


def img2text(img: Union[Image.Image, str, Path]) -> str:
    """
    Return a short English caption for an image.

    Args:
        img: PIL.Image, local path, or pathlib.Path.

    Returns:
        Caption string.
    """
    # ensure PIL.Image
    if not isinstance(img, Image.Image):
        img = Image.open(img)
    return _get_captioner()(img)[0]["generated_text"]
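
# Usage sketch — "example.jpg" is an illustrative placeholder, not a file
# shipped with this Space; the printed caption is likewise illustrative:
#
#   caption = img2text("example.jpg")
#   print(caption)   # e.g. "a dog running on the beach"
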
# -------------------------------------------------------------------
# Step 2. Caption ➜ Children’s story (DeepSeek-R1 1.5B)
# -------------------------------------------------------------------
import re

import torch
from transformers import pipeline

_GEN_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
_PROMPT_TMPL = (
    "Write a funny and warm children's story (50-100 words) for ages 3-10, "
    "fully and strictly based on this scene: {caption}\nStory:"
)

_generator = None


def _get_generator():
    """Lazy-load the DeepSeek generator once (GPU if available)."""
    global _generator
    if _generator is None:
        _generator = pipeline(
            "text-generation",
            model=_GEN_MODEL,
            device=0 if torch.cuda.is_available() else -1,
            max_new_tokens=150,
            do_sample=True,
            top_p=0.9,
            temperature=0.8,
            no_repeat_ngram_size=4,   # ← block 4-gram repeats
            repetition_penalty=1.15,  # ← soften copy-loops
        )
    return _generator


def _dedup_sentences(text: str) -> str:
    """Remove exact duplicate sentences while preserving order."""
    seen, cleaned = set(), []
    for sent in re.split(r"(?<=[.!?])\s+", text.strip()):
        s = sent.strip()
        if s and s not in seen:
            cleaned.append(s)
            seen.add(s)
    return " ".join(cleaned)
def text2story(caption: str) -> str:
    """
    Generate a ≤100-word children’s story from the image caption.

    Args:
        caption: scene description string.

    Returns:
        Story text (plain string, ≤100 words, no exact duplicate sentences).
    """
    prompt = _PROMPT_TMPL.format(caption=caption)
    raw = _get_generator()(prompt, return_full_text=False)[0]["generated_text"]
    # DeepSeek-R1 distill models can emit <think>…</think> reasoning before
    # the answer; strip it so it never leaks into the story
    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    story = _dedup_sentences(raw)
    # ensure ending punctuation
    if story and story[-1] not in ".!?":
        story += "."
    # hard cap at 100 words
    return " ".join(story.split()[:100])
# -------------------------------------------------------------------
# Step 3. Text ➜ Audio (MMS-TTS)
# -------------------------------------------------------------------
import textwrap

import numpy as np
import soundfile as sf
from transformers import pipeline

_TTS_MODEL = "facebook/mms-tts-eng"

_tts = None


def _get_tts():
    """Lazy-load the TTS pipeline once."""
    global _tts
    if _tts is None:
        _tts = pipeline("text-to-speech", model=_TTS_MODEL)
    return _tts


def story2audio(story: str, wav_path: str = "story.wav") -> str:
    """
    Synthesize speech for a story and save it as a WAV file.

    Args:
        story: Text returned by `text2story(...)`.
        wav_path: Output file name.

    Returns:
        Path to the saved WAV file.
    """
    tts = _get_tts()
    chunks = textwrap.wrap(story, width=200)  # long text → stable chunks
    audio = np.concatenate([tts(c)["audio"].squeeze() for c in chunks])
    sf.write(wav_path, audio, tts.model.config.sampling_rate)
    return wav_path
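

# -------------------------------------------------------------------
# End-to-end sketch: image ➜ caption ➜ story ➜ WAV. "example.jpg" is an
# illustrative placeholder; swap in any local image to try the chain.
# -------------------------------------------------------------------
if __name__ == "__main__":
    caption = img2text("example.jpg")
    story = text2story(caption)
    print("Caption:", caption)
    print("Story:", story)
    print("Audio saved to:", story2audio(story))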