LinkLinkWu committed on
Commit
75cc814
·
verified ·
1 Parent(s): 35beabe

Create func.py

Browse files
Files changed (1) hide show
  1. func.py +122 -0
func.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # func.py ── utilities for Hugging Face Space
2
+
3
+ # Step1. Image to Text
4
+ from typing import Union
5
+ from pathlib import Path
6
+ from PIL import Image
7
+ from transformers import pipeline
8
+
9
# Module-level cache: the caption pipeline is built on first use and reused.
_captioner = None


def _get_captioner():
    """Return the shared image-to-text pipeline, building it on first call.

    The pipeline is created with the
    ``Salesforce/blip-image-captioning-large`` model and stored in the
    module-level ``_captioner`` so later calls reuse the same object.
    """
    global _captioner
    if _captioner is not None:
        return _captioner
    _captioner = pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-large",
    )
    return _captioner
19
+
20
def img2text(img: Union[Image.Image, str, Path]) -> str:
    """
    Caption an image with the shared image-to-text pipeline.

    Args:
        img: An already-loaded PIL image, or a location (str or
            pathlib.Path) that is handed to ``Image.open``.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    # Anything that is not already a PIL image is treated as a path to open.
    picture = img if isinstance(img, Image.Image) else Image.open(img)
    captioner = _get_captioner()
    results = captioner(picture)
    return results[0]["generated_text"]
34
+
35
+ # Step2. Text Generation (Based on Caption)
36
+ import torch
37
+ from transformers import AutoTokenizer, AutoModelForCausalLM
38
+
39
# Checkpoint used for story generation.
_MODEL_NAME = "aspis/gpt2-genre-story-generation"

# Prompt template; ``{caption}`` is filled in with the image caption.
_PROMPT = (
    "Write a funny and warm children's story (50-100 words) for ages 3-10, "
    "fully based on this scene: {caption}\nStory:"
)

# Module-level cache, filled in by _load_story_model() on first use.
_tokenizer = None
_model = None


def _load_story_model():
    """Return the cached ``(tokenizer, model)`` pair, loading it on first call.

    Both objects are loaded from ``_MODEL_NAME``. The model is moved to CUDA
    when ``torch.cuda.is_available()`` reports a GPU.
    """
    global _tokenizer, _model
    if _model is not None:
        return _tokenizer, _model

    _tokenizer = AutoTokenizer.from_pretrained(_MODEL_NAME)
    _model = AutoModelForCausalLM.from_pretrained(_MODEL_NAME)
    if torch.cuda.is_available():
        _model = _model.to("cuda")
    return _tokenizer, _model
55
+
56
+
57
def _trim_story(text: str, max_words: int = 100) -> str:
    """Cap *text* at *max_words* words, then cut back to the last full sentence."""
    # Cap first: trimming to a sentence end before capping (as the code did
    # previously) lets the word cap slice the final sentence in half.
    capped = " ".join(text.split()[:max_words])
    last_stop = max(capped.rfind(mark) for mark in ".!?")
    # With no sentence terminator at all, keep the capped text rather than
    # returning an empty string.
    return capped[: last_stop + 1] if last_stop != -1 else capped


def text2story(caption: str) -> str:
    """
    Generate a short children's story from an image caption.

    The caption is inserted into ``_PROMPT``, the story model samples up to
    150 new tokens, and the continuation (prompt removed) is capped at 100
    words and trimmed back to the last sentence terminator (``.``, ``!`` or
    ``?``) inside that cap.

    Args:
        caption: Scene description string.

    Returns:
        Story text of at most 100 words with whitespace runs collapsed to
        single spaces. It ends on a sentence terminator whenever the capped
        text contains one.
    """
    tok, mdl = _load_story_model()

    prompt = _PROMPT.format(caption=caption)
    inputs = tok(prompt, return_tensors="pt", add_special_tokens=False)
    # Follow the model's actual device instead of hard-coding "cuda".
    inputs = {k: v.to(mdl.device) for k, v in inputs.items()}

    gen_ids = mdl.generate(
        **inputs,
        max_new_tokens=150,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tok.eos_token_id,
        repetition_penalty=1.1,
    )[0]

    # The generated sequence starts with the prompt tokens; keep only the rest.
    story_ids = gen_ids[inputs["input_ids"].shape[-1]:]
    story = tok.decode(story_ids, skip_special_tokens=True).strip()
    return _trim_story(story)
89
+
90
+ # Step3. Text to Audio
91
+ import numpy as np
92
+ import textwrap
93
+ import soundfile as sf
94
+ from transformers import pipeline
95
+
96
# Checkpoint used for speech synthesis.
_TTS_MODEL = "facebook/mms-tts-eng"

# Module-level cache: the TTS pipeline is built on first use and reused.
_tts = None


def _get_tts():
    """Return the shared text-to-speech pipeline, building it on first call."""
    global _tts
    if _tts is not None:
        return _tts
    _tts = pipeline("text-to-speech", model=_TTS_MODEL)
    return _tts
104
+
105
+
106
def story2audio(story: str, wav_path: str = "story.wav") -> str:
    """
    Synthesize speech for a story and save it as a WAV file.

    The story is wrapped into chunks of at most 200 characters, each chunk
    is synthesized separately, and the resulting waveforms are concatenated
    in order before being written to disk.

    Args:
        story: Text returned by `text2story(...)`.
        wav_path: Output file name.

    Returns:
        Path to the saved WAV file (the ``wav_path`` argument).

    Raises:
        ValueError: If ``story`` is empty or contains only whitespace.
    """
    # Validate before loading the TTS model: textwrap.wrap() returns [] for
    # blank text, and np.concatenate([]) would otherwise fail with an opaque
    # "need at least one array to concatenate" error.
    chunks = textwrap.wrap(story, width=200)  # long text → stable chunks
    if not chunks:
        raise ValueError("story must contain at least one non-whitespace character")

    tts = _get_tts()
    audio = np.concatenate([tts(c)["audio"].squeeze() for c in chunks])
    # The sampling rate is read from the loaded model's config.
    sf.write(wav_path, audio, tts.model.config.sampling_rate)
    return wav_path