Szeyu's picture
Update app.py
17f0914 verified
raw
history blame
4.75 kB
"""
Streamlit application that generates children's stories from images with audio narration.
Uses Hugging Face transformers for image captioning, story generation, and text-to-speech.
"""
import streamlit as st
from transformers import pipeline
import textwrap
import numpy as np
import soundfile as sf
import tempfile
import os
from PIL import Image
# Constants
MAX_STORY_WORDS = 100
TEXT_CHUNK_WIDTH = 200 # Characters per chunk for text-to-speech processing
AUDIO_SAMPLE_RATE = 16000 # 16kHz sampling rate for audio output
@st.cache_resource
def load_ml_pipelines():
    """
    Build the three Hugging Face pipelines the app relies on.

    Decorated with st.cache_resource so the (expensive) model loads happen
    once per server process and survive Streamlit reruns.

    Returns:
        tuple: Three pipeline objects, in order:
            - image-to-text (captioning)
            - text-generation (story)
            - text-to-speech (narration)
    """
    # Task/model pairs are fixed; iterate them so adding a model later is a
    # one-line change.
    model_specs = (
        ("image-to-text", "Salesforce/blip-image-captioning-large"),
        ("text-generation", "aspis/gpt2-genre-story-generation"),
        ("text-to-speech", "facebook/mms-tts-eng"),
    )
    return tuple(pipeline(task, model=name) for task, name in model_specs)
# Load the ML pipelines once at import time; st.cache_resource on the loader
# means repeated Streamlit reruns reuse the same cached model objects.
image_caption_pipeline, story_gen_pipeline, text_to_speech_pipeline = load_ml_pipelines()
def generate_story_content(uploaded_image):
    """
    Generate a caption, a children's story, and audio narration for an image.

    Args:
        uploaded_image (UploadedFile): File object from st.file_uploader.

    Returns:
        tuple: (caption_text, story_text, temp_audio_path). The audio path
        points to a temporary .wav file the caller is responsible for
        deleting.

    Raises:
        ValueError: If the story model produced no usable text (an empty
            story would otherwise crash np.concatenate on an empty list).
    """
    # Convert the uploaded file to PIL so the captioning pipeline accepts it.
    pil_image = Image.open(uploaded_image)

    # Generate and display the image caption.
    caption_result = image_caption_pipeline(pil_image)[0]
    caption_text = caption_result["generated_text"]
    st.write("**Caption:**", caption_text)

    # Create the story-generation prompt, grounded in the caption.
    story_prompt = (
        f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
        f"in third-person narrative, that describes this scene exactly: {caption_text} "
        f"mention the exact place or venue within {caption_text}"
    )

    # Generate the story text.
    story_output = story_gen_pipeline(
        story_prompt,
        max_new_tokens=150,
        temperature=0.7,         # Controls randomness (lower = more deterministic)
        top_p=0.9,               # Nucleus sampling probability
        no_repeat_ngram_size=2,  # Prevent repeating word pairs
        return_full_text=False
    )[0]["generated_text"].strip()

    # Trim the story to the maximum allowed words.
    story_words = story_output.split()
    trimmed_story = " ".join(story_words[:MAX_STORY_WORDS])
    if not trimmed_story:
        # Guard: with no text there are no TTS chunks, and np.concatenate([])
        # below would raise an opaque error instead of this clear one.
        raise ValueError("Story generation produced no text.")
    st.write("**Story:**", trimmed_story)

    # Split the story into chunks the TTS model can handle, synthesize each,
    # and concatenate the audio.
    story_chunks = textwrap.wrap(trimmed_story, width=TEXT_CHUNK_WIDTH)
    audio_segments = []
    sample_rate = AUDIO_SAMPLE_RATE  # Fallback if the pipeline reports none.
    for chunk in story_chunks:
        speech = text_to_speech_pipeline(chunk)
        audio_segments.append(np.asarray(speech["audio"]).squeeze())
        # Prefer the model's reported sampling rate over the hardcoded
        # constant so playback speed stays correct if the model is swapped.
        sample_rate = speech.get("sampling_rate", sample_rate)
    concatenated_audio = np.concatenate(audio_segments)

    # Write the narration to a temporary .wav file for st.audio.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        sf.write(temp_audio_file.name, concatenated_audio, samplerate=sample_rate)
        temp_audio_path = temp_audio_file.name

    return caption_text, trimmed_story, temp_audio_path
# Streamlit application interface
def main():
    """Main Streamlit application layout and interaction logic.

    Renders the uploader, and on button press generates caption/story/audio
    via generate_story_content, guaranteeing the temporary audio file is
    removed on both success and failure.
    """
    st.title("📖 Image to Children's Story with Audio Narration")
    st.markdown("""
    Upload an image to generate:
    1. A descriptive caption
    2. A children's story (ages 3-10)
    3. Audio narration of the story
    """)

    image_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])
    # Guard clauses: nothing to do until an image is uploaded and the button
    # is pressed.
    if image_file is None:
        return
    st.image(image_file, caption="Uploaded Image", use_column_width=True)
    if not st.button("Generate Story and Audio"):
        return

    audio_path = None  # Tracked so cleanup works on every exit path.
    with st.spinner("Creating magical story..."):
        try:
            caption, story, audio_path = generate_story_content(image_file)
            st.success("Here's your generated story!")
            # Display audio player (st.audio reads the file immediately).
            st.audio(audio_path, format="audio/wav")
        except Exception as e:
            st.error(f"Something went wrong: {str(e)}")
        finally:
            # Single cleanup path replaces the old duplicated os.remove calls
            # and the fragile `'audio_path' in locals()` probe; the exists()
            # check prevents a secondary crash if the file is already gone.
            if audio_path is not None and os.path.exists(audio_path):
                os.remove(audio_path)
# Run the app only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()