# Szeyu's picture
# Create app.py
# 64fd107 verified
# raw
# history blame
# 5.94 kB
import streamlit as st
from transformers import pipeline
from PIL import Image
import io, textwrap, numpy as np, soundfile as sf
# ------------------ Streamlit Page Configuration ------------------
# Configure the browser tab; must run before any other Streamlit command.
_PAGE_SETTINGS = {
    "page_title": "Picture to Story Magic",  # title shown on the browser tab
    "page_icon": "🦄",                       # fun unicorn icon
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)
# ------------------ Custom CSS for a Colorful Background ------------------
# Inject a soft pastel page background via raw CSS.
_BACKGROUND_CSS = """
<style>
body {
    background-color: #FDEBD0; /* A soft pastel color */
}
</style>
"""
st.markdown(_BACKGROUND_CSS, unsafe_allow_html=True)
# ------------------ Playful Header for Young Users ------------------
# Playful, kid-friendly page header rendered as raw HTML.
_HEADER_HTML = """
<h1 style='text-align: center; color: #ff66cc;'>Picture to Story Magic!</h1>
<p style='text-align: center; font-size: 24px;'>
Hi little artist! Upload your picture and let us create a fun story just for you! 🎉
</p>
"""
st.markdown(_HEADER_HTML, unsafe_allow_html=True)
# ------------------ Lazy Model Loading ------------------
def load_models():
    """
    Lazily create the three pipelines and cache them in Streamlit session state.

    Pipelines (created once per session, only if missing):
        captioner -- image-to-text using the lighter BLIP "base" checkpoint,
                     for faster/cost-effective captioning.
        storyer   -- text-generation using aspis/gpt2-genre-story-generation,
                     for the humorous children's story.
        tts       -- text-to-speech using facebook/mms-tts-eng.
    """
    # (session-state key, pipeline task, model checkpoint)
    pipeline_specs = (
        ("captioner", "image-to-text", "Salesforce/blip-image-captioning-base"),
        ("storyer", "text-generation", "aspis/gpt2-genre-story-generation"),
        ("tts", "text-to-speech", "facebook/mms-tts-eng"),
    )
    for state_key, task, checkpoint in pipeline_specs:
        if state_key not in st.session_state:
            st.session_state[state_key] = pipeline(task, model=checkpoint)
# ------------------ Caching Functions ------------------
@st.cache_data(show_spinner=False)
def get_caption(image_bytes):
    """
    Return a descriptive caption for the uploaded picture.

    The raw bytes are decoded to an RGB image and downscaled in place
    (aspect ratio preserved) to at most 256x256 so the captioner runs
    faster; results are cached per image by st.cache_data.
    """
    pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    pil_image.thumbnail((256, 256))  # shrink for quicker inference
    outputs = st.session_state.captioner(pil_image)
    return outputs[0]["generated_text"]
@st.cache_data(show_spinner=False)
def get_story(caption):
    """
    Generate a humorous, engaging children's story from an image caption.

    The prompt asks for a playful 50-100 word story; max_new_tokens is kept
    at 80 so generation stays fast. The output is capped at ~100 words, and
    then trimmed back to the last sentence-ending punctuation mark (., !, ?)
    when one exists, so the story no longer ends mid-sentence — the original
    hard cut at 100 words contradicted the prompt's own "ends with a
    complete sentence" requirement.
    """
    prompt = (
        f"Write a funny, warm, and imaginative children's story for ages 3-10, 50-100 words, "
        f"in third-person narrative, as if the author is playfully describing the scene in the image: {caption}. "
        "Explicitly mention the exact venue or location (such as a park, school, or home), describe specific characters "
        "(for example, a little girl named Lily or a boy named Jack), and detail the humorous actions they perform. "
        "Ensure the story is playful, engaging, and ends with a complete sentence."
    )
    raw_story = st.session_state.storyer(
        prompt,
        max_new_tokens=80,  # reduced token generation for faster response
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        return_full_text=False,
    )[0]["generated_text"].strip()
    # Cap at ~100 words as before.
    story = " ".join(raw_story.split()[:100])
    # Trim back to the last complete sentence so the story ends cleanly;
    # if no terminator is present, fall back to the word-capped text.
    for idx in range(len(story) - 1, -1, -1):
        if story[idx] in ".!?":
            return story[: idx + 1]
    return story
@st.cache_data(show_spinner=False)
def get_audio(story):
    """
    Synthesize the story text to speech and return an in-memory WAV buffer.

    The text is split into <=300-character chunks to reduce repeated TTS
    calls, the per-chunk waveforms are concatenated, and the result is
    written to a BytesIO WAV buffer rewound to position 0 for st.audio.

    Fix: an empty/whitespace-only story previously produced an empty chunk
    list, and np.concatenate([]) raises ValueError; it now yields a silent
    zero-length clip instead of crashing.
    """
    chunks = textwrap.wrap(story, width=300)
    # Sampling rate comes from the TTS model's own config.
    sampling_rate = st.session_state.tts.model.config.sampling_rate
    if chunks:
        audio_chunks = [
            # asarray guards against list/array variations before squeeze
            np.asarray(st.session_state.tts(chunk)["audio"]).squeeze()
            for chunk in chunks
        ]
        audio = np.concatenate(audio_chunks)
    else:
        audio = np.zeros(0, dtype=np.float32)  # degenerate input -> silence
    buffer = io.BytesIO()
    sf.write(buffer, audio, sampling_rate, format="WAV")
    buffer.seek(0)  # rewind so the caller reads from the start
    return buffer
# ------------------ Main App Logic ------------------
# ---- Main flow: upload -> caption -> story -> narrated audio ----
uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
    try:
        load_models()  # idempotent: pipelines are created once per session
        picture_bytes = uploaded_file.getvalue()

        # Echo the uploaded picture back to the child.
        preview = Image.open(io.BytesIO(picture_bytes)).convert("RGB")
        st.image(preview, caption="Your Amazing Picture!", use_column_width=True)

        st.markdown("<h3 style='text-align: center;'>Ready for your story?</h3>", unsafe_allow_html=True)

        if st.button("Story, Please!"):
            # Step 1: describe the picture.
            with st.spinner("Generating caption..."):
                caption = get_caption(picture_bytes)
            st.markdown("<h3 style='text-align: center;'>Caption:</h3>", unsafe_allow_html=True)
            st.write(caption)

            # Step 2: turn the caption into a story.
            with st.spinner("Generating story..."):
                story = get_story(caption)
            st.markdown("<h3 style='text-align: center;'>Your Story:</h3>", unsafe_allow_html=True)
            st.write(story)

            # Step 3: narrate the story.
            with st.spinner("Generating audio..."):
                audio_buffer = get_audio(story)
            st.audio(audio_buffer, format="audio/wav", start_time=0)

            st.markdown(
                "<p style='text-align: center; font-weight: bold;'>Enjoy your magical story! 🎶</p>",
                unsafe_allow_html=True,
            )
    except Exception as err:
        # Friendly message first; raw details below for debugging.
        st.error("Oops! Something went wrong. Please try a different picture or check the file format!")
        st.error(f"Error details: {err}")