User_Input_Pipeline_1

Sleeping

App Files Files Community

LinkLinkWu committed on Apr 30

Commit

75cc814

verified ·

1 Parent(s): 35beabe

Create func.py

Browse files

Files changed (1) hide show

func.py +122 -0

func.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# func.py  ── utilities for Hugging Face Space
+# Step1. Image to Text
+from typing import Union
+from pathlib import Path
+from PIL import Image
+from transformers import pipeline
+# lazy-load caption model once
+_captioner = None
+def _get_captioner():
+    global _captioner
+    if _captioner is None:
+        _captioner = pipeline(
+            "image-to-text",
+            model="Salesforce/blip-image-captioning-large"
+        )
+    return _captioner
+def img2text(img: Union[Image.Image, str, Path]) -> str:
+    """
+    Return a short English caption for an image.
+    Args:
+        img: PIL.Image, local path, or pathlib.Path.
+    Returns:
+        Caption string.
+    """
+    # ensure PIL.Image
+    if not isinstance(img, Image.Image):
+        img = Image.open(img)
+    return _get_captioner()(img)[0]["generated_text"]
+# Step2. Text Generation (Based on Caption)
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+_MODEL_NAME = "aspis/gpt2-genre-story-generation"
+_PROMPT     = (
+    "Write a funny and warm children's story (50-100 words) for ages 3-10, "
+    "fully based on this scene: {caption}\nStory:"
+)
+_tokenizer, _model = None, None
+def _load_story_model():
+    """Lazy-load tokenizer / model once."""
+    global _tokenizer, _model
+    if _model is None:
+        _tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
+        _model     = AutoModelForCausalLM.from_pretrained(_MODEL_NAME)
+        if torch.cuda.is_available():
+            _model = _model.to("cuda")
+    return _tokenizer, _model
+def text2story(caption: str) -> str:
+    """
+    Generate a 50-100-word children’s story from an image caption.
+    Args:
+        caption: Scene description string.
+    Returns:
+        Story text (≤100 words).
+    """
+    tok, mdl = _load_story_model()
+    prompt  = _PROMPT.format(caption=caption)
+    inputs  = tok(prompt, return_tensors="pt", add_special_tokens=False)
+    if mdl.device.type == "cuda":
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    gen_ids = mdl.generate(
+        **inputs,
+        max_new_tokens=150,
+        do_sample=True,
+        top_p=0.9,
+        temperature=0.8,
+        pad_token_id=tok.eos_token_id,
+        repetition_penalty=1.1
+    )[0]
+    # drop prompt, decode, keep ≤100 words, end at last period
+    story_ids = gen_ids[inputs["input_ids"].shape[-1]:]
+    story     = tok.decode(story_ids, skip_special_tokens=True).strip()
+    story     = story[: story.rfind(".") + 1] if "." in story else story
+    return " ".join(story.split()[:100])
+# Step3. Text to Audio
+import numpy as np
+import textwrap
+import soundfile as sf
+from transformers import pipeline
+_TTS_MODEL = "facebook/mms-tts-eng"
+_tts = None
+def _get_tts():
+    """Lazy-load the TTS pipeline once."""
+    global _tts
+    if _tts is None:
+        _tts = pipeline("text-to-speech", model=_TTS_MODEL)
+    return _tts
+def story2audio(story: str, wav_path: str = "story.wav") -> str:
+    """
+    Synthesize speech for a story and save as WAV.
+    Args:
+        story: Text returned by `text2story(...)`.
+        wav_path: Output file name.
+    Returns:
+        Path to the saved WAV file.
+    """
+    tts = _get_tts()
+    chunks = textwrap.wrap(story, width=200)               # long text → stable chunks
+    audio  = np.concatenate([tts(c)["audio"].squeeze()
+                             for c in chunks])
+    sf.write(wav_path, audio, tts.model.config.sampling_rate)
+    return wav_path