# Szeyu's picture
# Create app.py
# 64fd107 verified
# raw
# history blame
# 5.94 kB
import streamlit as st
from transformers import pipeline
from PIL import Image
import io, textwrap, numpy as np, soundfile as sf
# ------------------ Streamlit Page Configuration ------------------
# Configure the browser tab; must run before any other Streamlit command.
_PAGE_SETTINGS = {
    "page_title": "Picture to Story Magic",  # title shown on the browser tab
    "page_icon": "🦄",                       # fun unicorn icon
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)
# ------------------ Custom CSS for a Colorful Background ------------------
# Inject a soft pastel page background via raw CSS.
_BACKGROUND_CSS = """
<style>
body {
    background-color: #FDEBD0; /* A soft pastel color */
}
</style>
"""
st.markdown(_BACKGROUND_CSS, unsafe_allow_html=True)
# ------------------ Playful Header for Young Users ------------------
# Playful, kid-friendly page header rendered as raw HTML.
_HEADER_HTML = """
<h1 style='text-align: center; color: #ff66cc;'>Picture to Story Magic!</h1>
<p style='text-align: center; font-size: 24px;'>
Hi little artist! Upload your picture and let us create a fun story just for you! 🎉
</p>
"""
st.markdown(_HEADER_HTML, unsafe_allow_html=True)
# ------------------ Lazy Model Loading ------------------
def load_models():
    """
    Lazily create the three pipelines and cache them in Streamlit session state.

    Pipelines (created once per session, only if missing):
        captioner -- image-to-text using the lighter BLIP "base" checkpoint,
                     for faster/cost-effective captioning.
        storyer   -- text-generation using aspis/gpt2-genre-story-generation,
                     for the humorous children's story.
        tts       -- text-to-speech using facebook/mms-tts-eng.
    """
    # (session-state key, pipeline task, model checkpoint)
    pipeline_specs = (
        ("captioner", "image-to-text", "Salesforce/blip-image-captioning-base"),
        ("storyer", "text-generation", "aspis/gpt2-genre-story-generation"),
        ("tts", "text-to-speech", "facebook/mms-tts-eng"),
    )
    for state_key, task, checkpoint in pipeline_specs:
        if state_key not in st.session_state:
            st.session_state[state_key] = pipeline(task, model=checkpoint)
# ------------------ Caching Functions ------------------
@st.cache_data(show_spinner=False)
def get_caption(image_bytes):
    """
    Return a descriptive caption for the uploaded picture.

    The raw bytes are decoded to an RGB image and downscaled in place
    (aspect ratio preserved) to at most 256x256 so the captioner runs
    faster; results are cached per image by st.cache_data.
    """
    pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    pil_image.thumbnail((256, 256))  # shrink for quicker inference
    outputs = st.session_state.captioner(pil_image)
    return outputs[0]["generated_text"]
@st.cache_data(show_spinner=False)
def get_story(caption):
    """
    Generate a humorous, engaging children's story from an image caption.

    The prompt asks for a playful 50-100 word story; max_new_tokens is kept
    at 80 so generation stays fast. The output is capped at ~100 words, and
    then trimmed back to the last sentence-ending punctuation mark (., !, ?)
    when one exists, so the story no longer ends mid-sentence — the original
    hard cut at 100 words contradicted the prompt's own "ends with a
    complete sentence" requirement.
    """
    prompt = (
        f"Write a funny, warm, and imaginative children's story for ages 3-10, 50-100 words, "
        f"in third-person narrative, as if the author is playfully describing the scene in the image: {caption}. "
        "Explicitly mention the exact venue or location (such as a park, school, or home), describe specific characters "
        "(for example, a little girl named Lily or a boy named Jack), and detail the humorous actions they perform. "
        "Ensure the story is playful, engaging, and ends with a complete sentence."
    )
    raw_story = st.session_state.storyer(
        prompt,
        max_new_tokens=80,  # reduced token generation for faster response
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        return_full_text=False,
    )[0]["generated_text"].strip()
    # Cap at ~100 words as before.
    story = " ".join(raw_story.split()[:100])
    # Trim back to the last complete sentence so the story ends cleanly;
    # if no terminator is present, fall back to the word-capped text.
    for idx in range(len(story) - 1, -1, -1):
        if story[idx] in ".!?":
            return story[: idx + 1]
    return story
@st.cache_data(show_spinner=False)
def get_audio(story):
    """
    Synthesize the story text to speech and return an in-memory WAV buffer.

    The text is split into <=300-character chunks to reduce repeated TTS
    calls, the per-chunk waveforms are concatenated, and the result is
    written to a BytesIO WAV buffer rewound to position 0 for st.audio.

    Fix: an empty/whitespace-only story previously produced an empty chunk
    list, and np.concatenate([]) raises ValueError; it now yields a silent
    zero-length clip instead of crashing.
    """
    chunks = textwrap.wrap(story, width=300)
    # Sampling rate comes from the TTS model's own config.
    sampling_rate = st.session_state.tts.model.config.sampling_rate
    if chunks:
        audio_chunks = [
            # asarray guards against list/array variations before squeeze
            np.asarray(st.session_state.tts(chunk)["audio"]).squeeze()
            for chunk in chunks
        ]
        audio = np.concatenate(audio_chunks)
    else:
        audio = np.zeros(0, dtype=np.float32)  # degenerate input -> silence
    buffer = io.BytesIO()
    sf.write(buffer, audio, sampling_rate, format="WAV")
    buffer.seek(0)  # rewind so the caller reads from the start
    return buffer
# ------------------ Main App Logic ------------------
# ---- Main flow: upload -> caption -> story -> narrated audio ----
uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
    try:
        load_models()  # idempotent: pipelines are created once per session
        picture_bytes = uploaded_file.getvalue()

        # Echo the uploaded picture back to the child.
        preview = Image.open(io.BytesIO(picture_bytes)).convert("RGB")
        st.image(preview, caption="Your Amazing Picture!", use_column_width=True)

        st.markdown("<h3 style='text-align: center;'>Ready for your story?</h3>", unsafe_allow_html=True)

        if st.button("Story, Please!"):
            # Step 1: describe the picture.
            with st.spinner("Generating caption..."):
                caption = get_caption(picture_bytes)
            st.markdown("<h3 style='text-align: center;'>Caption:</h3>", unsafe_allow_html=True)
            st.write(caption)

            # Step 2: turn the caption into a story.
            with st.spinner("Generating story..."):
                story = get_story(caption)
            st.markdown("<h3 style='text-align: center;'>Your Story:</h3>", unsafe_allow_html=True)
            st.write(story)

            # Step 3: narrate the story.
            with st.spinner("Generating audio..."):
                audio_buffer = get_audio(story)
            st.audio(audio_buffer, format="audio/wav", start_time=0)

            st.markdown(
                "<p style='text-align: center; font-weight: bold;'>Enjoy your magical story! 🎶</p>",
                unsafe_allow_html=True,
            )
    except Exception as err:
        # Friendly message first; raw details below for debugging.
        st.error("Oops! Something went wrong. Please try a different picture or check the file format!")
        st.error(f"Error details: {err}")