Szeyu's picture
Update app.py
17f0914 verified
raw
history blame
4.75 kB
"""
Streamlit application that generates children's stories from images with audio narration.
Uses Hugging Face transformers for image captioning, story generation, and text-to-speech.
"""
import streamlit as st
from transformers import pipeline
import textwrap
import numpy as np
import soundfile as sf
import tempfile
import os
from PIL import Image
# Constants
MAX_STORY_WORDS = 100
TEXT_CHUNK_WIDTH = 200 # Characters per chunk for text-to-speech processing
AUDIO_SAMPLE_RATE = 16000 # 16kHz sampling rate for audio output
@st.cache_resource
def load_ml_pipelines():
    """
    Build the three Hugging Face pipelines the app relies on.

    Decorated with st.cache_resource so the (expensive) model loads happen
    once per server process and survive Streamlit reruns.

    Returns:
        tuple: Three pipeline objects, in order:
            - image-to-text (captioning)
            - text-generation (story)
            - text-to-speech (narration)
    """
    # Task/model pairs are fixed; iterate them so adding a model later is a
    # one-line change.
    model_specs = (
        ("image-to-text", "Salesforce/blip-image-captioning-large"),
        ("text-generation", "aspis/gpt2-genre-story-generation"),
        ("text-to-speech", "facebook/mms-tts-eng"),
    )
    return tuple(pipeline(task, model=name) for task, name in model_specs)
# Load the ML pipelines once at import time; st.cache_resource on the loader
# means repeated Streamlit reruns reuse the same cached model objects.
image_caption_pipeline, story_gen_pipeline, text_to_speech_pipeline = load_ml_pipelines()
def generate_story_content(uploaded_image):
    """
    Generate a caption, a children's story, and audio narration for an image.

    Args:
        uploaded_image (UploadedFile): File object from st.file_uploader.

    Returns:
        tuple: (caption_text, story_text, temp_audio_path). The audio path
        points to a temporary .wav file the caller is responsible for
        deleting.

    Raises:
        ValueError: If the story model produced no usable text (an empty
            story would otherwise crash np.concatenate on an empty list).
    """
    # Convert the uploaded file to PIL so the captioning pipeline accepts it.
    pil_image = Image.open(uploaded_image)

    # Generate and display the image caption.
    caption_result = image_caption_pipeline(pil_image)[0]
    caption_text = caption_result["generated_text"]
    st.write("**Caption:**", caption_text)

    # Create the story-generation prompt, grounded in the caption.
    story_prompt = (
        f"Write a funny, warm children's story for ages 3-10, 50–100 words, "
        f"in third-person narrative, that describes this scene exactly: {caption_text} "
        f"mention the exact place or venue within {caption_text}"
    )

    # Generate the story text.
    story_output = story_gen_pipeline(
        story_prompt,
        max_new_tokens=150,
        temperature=0.7,         # Controls randomness (lower = more deterministic)
        top_p=0.9,               # Nucleus sampling probability
        no_repeat_ngram_size=2,  # Prevent repeating word pairs
        return_full_text=False
    )[0]["generated_text"].strip()

    # Trim the story to the maximum allowed words.
    story_words = story_output.split()
    trimmed_story = " ".join(story_words[:MAX_STORY_WORDS])
    if not trimmed_story:
        # Guard: with no text there are no TTS chunks, and np.concatenate([])
        # below would raise an opaque error instead of this clear one.
        raise ValueError("Story generation produced no text.")
    st.write("**Story:**", trimmed_story)

    # Split the story into chunks the TTS model can handle, synthesize each,
    # and concatenate the audio.
    story_chunks = textwrap.wrap(trimmed_story, width=TEXT_CHUNK_WIDTH)
    audio_segments = []
    sample_rate = AUDIO_SAMPLE_RATE  # Fallback if the pipeline reports none.
    for chunk in story_chunks:
        speech = text_to_speech_pipeline(chunk)
        audio_segments.append(np.asarray(speech["audio"]).squeeze())
        # Prefer the model's reported sampling rate over the hardcoded
        # constant so playback speed stays correct if the model is swapped.
        sample_rate = speech.get("sampling_rate", sample_rate)
    concatenated_audio = np.concatenate(audio_segments)

    # Write the narration to a temporary .wav file for st.audio.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
        sf.write(temp_audio_file.name, concatenated_audio, samplerate=sample_rate)
        temp_audio_path = temp_audio_file.name

    return caption_text, trimmed_story, temp_audio_path
# Streamlit application interface
def main():
    """Main Streamlit application layout and interaction logic.

    Renders the uploader, and on button press generates caption/story/audio
    via generate_story_content, guaranteeing the temporary audio file is
    removed on both success and failure.
    """
    st.title("📖 Image to Children's Story with Audio Narration")
    st.markdown("""
    Upload an image to generate:
    1. A descriptive caption
    2. A children's story (ages 3-10)
    3. Audio narration of the story
    """)

    image_file = st.file_uploader("Choose an image", type=["jpg", "jpeg", "png"])
    # Guard clauses: nothing to do until an image is uploaded and the button
    # is pressed.
    if image_file is None:
        return
    st.image(image_file, caption="Uploaded Image", use_column_width=True)
    if not st.button("Generate Story and Audio"):
        return

    audio_path = None  # Tracked so cleanup works on every exit path.
    with st.spinner("Creating magical story..."):
        try:
            caption, story, audio_path = generate_story_content(image_file)
            st.success("Here's your generated story!")
            # Display audio player (st.audio reads the file immediately).
            st.audio(audio_path, format="audio/wav")
        except Exception as e:
            st.error(f"Something went wrong: {str(e)}")
        finally:
            # Single cleanup path replaces the old duplicated os.remove calls
            # and the fragile `'audio_path' in locals()` probe; the exists()
            # check prevents a secondary crash if the file is already gone.
            if audio_path is not None and os.path.exists(audio_path):
                os.remove(audio_path)
# Run the app only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()