Szeyu's picture
Update app.py
b95df49 verified
raw
history blame
3.27 kB
# import part
import streamlit as st
from transformers import pipeline
import textwrap
import numpy as np
import soundfile as sf
import tempfile
import os
from PIL import Image
import string
# Build the model pipelines once; st.cache_resource keeps the returned objects
# so they are not re-created on every script rerun.
@st.cache_resource
def load_pipelines():
    """Create the three Hugging Face pipelines used by the app.

    Returns:
        tuple: ``(captioner, storyer, tts)`` -- the image-to-text,
        text-generation and text-to-speech pipelines, in that order.
    """
    pipeline_specs = (
        ("image-to-text", "Salesforce/blip-image-captioning-large"),
        ("text-generation", "aspis/gpt2-genre-story-generation"),
        ("text-to-speech", "facebook/mms-tts-eng"),
    )
    return tuple(
        pipeline(task, model=model_name) for task, model_name in pipeline_specs
    )


captioner, storyer, tts = load_pipelines()
# Function part
# Characters kept in the generated story; everything else (digits and symbols
# such as * and ~) is dropped before the text is shown or spoken.
_ALLOWED_STORY_CHARS = frozenset(string.ascii_letters + " .,!?\"'-")

# Closing sentence appended once when the model returns a very short story.
_FILLER_SENTENCE = (
    "The children laughed and played happily, making new friends in the sunny park."
)


def _clean_story(raw, min_words=50, max_words=100):
    """Return *raw* reduced to the allowed characters and at most *max_words* words.

    Every run of whitespace (including newlines and tabs) is collapsed to a
    single space *before* filtering; filtering first would delete the line
    breaks and glue the neighbouring words together.

    When fewer than *min_words* words remain, the filler sentence is appended
    once. This lengthens short stories but does not guarantee *min_words*.
    """
    single_spaced = " ".join(raw.split())
    kept = "".join(ch for ch in single_spaced if ch in _ALLOWED_STORY_CHARS)
    words = kept.split()
    if len(words) < min_words:
        words.extend(_FILLER_SENTENCE.split())
    return " ".join(words[:max_words])


def _write_wav(audio, sampling_rate):
    """Write *audio* to a new temporary ``.wav`` file and return its path."""
    # Only reserve the file name here. Re-opening a NamedTemporaryFile by name
    # while its handle is still open is not portable (it fails on Windows), so
    # the handle is closed before soundfile writes to the path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        wav_path = temp_file.name
    try:
        sf.write(wav_path, audio, sampling_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        os.remove(wav_path)
        raise
    return wav_path


# Function to generate content from an image
def generate_content(image):
    """Caption *image*, generate a short story about it and synthesize speech.

    The caption and the story are also written to the Streamlit page as a
    side effect.

    Args:
        image: Anything ``PIL.Image.open`` accepts (the UI passes the
            uploaded file object).

    Returns:
        tuple: ``(caption, story, audio_path)`` where ``audio_path`` is a
        temporary ``.wav`` file that the caller is responsible for deleting.
    """
    pil_image = Image.open(image)

    # Generate caption
    caption = captioner(pil_image)[0]["generated_text"]
    st.write("**🌟 What's in the picture: 🌟**")
    st.write(caption)

    # Create prompt for story
    prompt = (
        f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
        f"Completely and precisely centered on this scene {caption}\nStory:"
    )

    # Generate raw story. do_sample=True is passed explicitly because
    # temperature/top_p only influence generation when sampling is enabled.
    raw = storyer(
        prompt,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        no_repeat_ngram_size=2,
        return_full_text=False,
    )[0]["generated_text"].strip()

    story = _clean_story(raw)
    st.write("**📖 Your funny story: 📖**")
    st.write(story)

    # Synthesize the story in chunks of at most 200 characters and join the
    # resulting waveforms. _clean_story never returns an empty string with the
    # default min_words, so there is always at least one chunk to concatenate.
    chunks = textwrap.wrap(story, width=200)
    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])

    audio_path = _write_wav(audio, tts.model.config.sampling_rate)
    return caption, story, audio_path
# Streamlit UI: upload a picture, preview it, and on button press show the
# caption and story and play the narrated audio.
st.title("✨ Magic Story Maker ✨")
st.markdown("Upload a picture to make a funny story and hear it too! 📸")
uploaded_image = st.file_uploader("Choose your picture", type=["jpg", "jpeg", "png"])

if uploaded_image is None:
    # NOTE(review): this URL looks like a stub placeholder -- confirm it
    # resolves to a real image or replace it with a bundled asset.
    st.image(
        "https://example.com/placeholder_image.jpg",
        caption="Upload your picture here! 📷",
        use_column_width=True,
    )
else:
    st.image(uploaded_image, caption="Your Picture 🌟", use_column_width=True)

if st.button("✨ Make My Story! ✨"):
    if uploaded_image is None:
        st.warning("Please upload a picture first! 📸")
    else:
        with st.spinner("🔮 Creating your magical story..."):
            # The caption and story are rendered inside generate_content;
            # only the audio path is needed here.
            _, _, audio_path = generate_content(uploaded_image)
        try:
            # Read the audio into memory so playback does not depend on the
            # temporary file still existing after it is deleted below.
            with open(audio_path, "rb") as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # Always remove the temporary file, even if reading it failed.
            os.remove(audio_path)
        st.success("🎉 Your story is ready! 🎉")
        st.audio(audio_bytes, format="audio/wav")