"""
Streamlit application that generates children's stories from images with audio narration.

Uses Hugging Face transformers for image captioning, story generation, and text-to-speech.
"""

import os
import tempfile
import textwrap

import numpy as np
import soundfile as sf
import streamlit as st
from PIL import Image
from transformers import pipeline

MAX_STORY_WORDS = 100
TEXT_CHUNK_WIDTH = 200
AUDIO_SAMPLE_RATE = 16000  # Hz; the output rate of facebook/mms-tts-eng


@st.cache_resource
def load_ml_pipelines():
    """
    Load and cache ML models for image captioning, story generation, and text-to-speech.

    Returns:
        tuple: Three pipeline objects for:
            - Image-to-text (captioning)
            - Text generation (story)
            - Text-to-speech
    """
    caption_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
    story_pipeline = pipeline("text-generation", model="aspis/gpt2-genre-story-generation")
    tts_pipeline = pipeline("text-to-speech", model="facebook/mms-tts-eng")

    return caption_pipeline, story_pipeline, tts_pipeline


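# The pipelines are created once and reused across Streamlit reruns via st.cache_resource.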
image_caption_pipeline, story_gen_pipeline, text_to_speech_pipeline = load_ml_pipelines()


def generate_story_content(uploaded_image):
    """
    Process an image to generate a caption, a story, and audio narration.

    Args:
        uploaded_image (UploadedFile): Streamlit file uploader object

    Returns:
        tuple: (caption_text, story_text, temp_audio_path)
    """
    pil_image = Image.open(uploaded_image)

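    # Caption the image with the BLIP image-to-text pipeline.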
    caption_result = image_caption_pipeline(pil_image)[0]
    caption_text = caption_result["generated_text"]
    st.write("**Caption:**", caption_text)

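    # Turn the caption into a child-friendly story prompt.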
    story_prompt = (
        f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
        f"in third-person narrative, that describes this scene exactly: {caption_text} "
        f"mention the exact place or venue within {caption_text}"
    )

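    # Generate the story; do_sample=True is needed for temperature/top_p to take effect.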
    story_output = story_gen_pipeline(
        story_prompt,
        do_sample=True,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        no_repeat_ngram_size=2,
        return_full_text=False,
    )[0]["generated_text"].strip()

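    # Cap the story at MAX_STORY_WORDS words.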
    story_words = story_output.split()
    trimmed_story = " ".join(story_words[:MAX_STORY_WORDS])
    st.write("**Story:**", trimmed_story)

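    # Split the story into chunks the TTS model can handle, synthesize each, then join the waveforms.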
    story_chunks = textwrap.wrap(trimmed_story, width=TEXT_CHUNK_WIDTH)
    audio_segments = [
        text_to_speech_pipeline(chunk)["audio"].squeeze()
        for chunk in story_chunks
    ]
    concatenated_audio = np.concatenate(audio_segments)

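    # Write the narration to a temporary WAV file so Streamlit can play it back.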
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        sf.write(temp_audio_file.name, concatenated_audio, samplerate=AUDIO_SAMPLE_RATE)
        temp_audio_path = temp_audio_file.name

    return caption_text, trimmed_story, temp_audio_path


def main():
    """Main Streamlit application layout and interaction logic."""
    st.title("📖 Image to Children's Story with Audio Narration")
    st.markdown("""
Upload an image to generate:
1. A descriptive caption
2. A children's story (ages 3-10)
3. Audio narration of the story
""")

    image_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])

    if image_file is not None:
        st.image(image_file, caption="Uploaded Image", use_column_width=True)

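        # Run the caption, story, and narration flow on demand.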
        if st.button("Generate Story and Audio"):
            with st.spinner("Creating magical story..."):
                try:
                    caption, story, audio_path = generate_story_content(image_file)
                    st.success("Here's your generated story!")

                    st.audio(audio_path, format="audio/wav")

                    # Remove the temporary WAV file once the player has it.
                    os.remove(audio_path)
                except Exception as e:
                    st.error(f"Something went wrong: {e}")
                    if 'audio_path' in locals():
                        os.remove(audio_path)


if __name__ == "__main__":
    main()