File size: 4,832 Bytes
bc7ea9a 29e42d5 b9ea623 29e42d5 37631cc f22469a 29e42d5 f22469a 29e42d5 e35a81f df08c46 e35a81f 29e42d5 e35a81f 29e42d5 bc7ea9a 890cd41 bc7ea9a 890cd41 bc7ea9a 890cd41 bc7ea9a 890cd41 bc7ea9a 4cf543b e35a81f 822643b 4cf543b 29e42d5 4cf543b e35a81f 890cd41 4cf543b 04471ed 29e42d5 bc7ea9a e35a81f e9ee97c 019caa7 29e42d5 f22469a bc7ea9a 822643b 4cf543b 29e42d5 4cf543b 890cd41 4cf543b e35a81f 4cf543b 890cd41 e35a81f 4cf543b 29e42d5 4cf543b 890cd41 4cf543b 29e42d5 bc7ea9a 822643b 29e42d5 f22469a 822643b 8b6a9db 822643b 8b6a9db 59b54a5 822643b e35a81f 822643b e35a81f 59b54a5 890cd41 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import re
import streamlit as st
from transformers import pipeline
import textwrap
import numpy as np
import soundfile as sf
import tempfile
import os
from PIL import Image
import string
# Build the model pipelines once; st.cache_resource keeps the result cached
@st.cache_resource
def load_pipelines():
    """Create the captioning, story-generation and text-to-speech pipelines.

    Returns:
        A ``(captioner, storyer, tts)`` tuple of ``transformers`` pipelines,
        in that order.
    """
    model_specs = (
        ("image-to-text", "Salesforce/blip-image-captioning-large"),
        ("text-generation", "aspis/gpt2-genre-story-generation"),
        ("text-to-speech", "facebook/mms-tts-eng"),
    )
    # The generator is consumed in order, so the models load in the same
    # sequence as they are listed above.
    image_to_text, text_generator, speech = (
        pipeline(task, model=model_name) for task, model_name in model_specs
    )
    return image_to_text, text_generator, speech


captioner, storyer, tts = load_pipelines()
def clean_generated_story(raw_story: str) -> str:
    """Strip noise from generated text and cap it at 100 words.

    The cleaning steps are, in order:

    1. Remove URLs that start with ``http://``, ``https://`` or ``www.``.
    2. Remove bare domain names such as ``erskybooks.com``.
    3. Remove every run of digits.
    4. Drop whitespace-separated tokens that look like noise (see below).
    5. Keep only the first 100 remaining tokens.

    A token is judged on its *core*, i.e. the token with leading and
    trailing ASCII punctuation stripped; the token itself is kept verbatim
    when it passes. A token is dropped when its core is empty, is a single
    letter other than "a"/"I", or contains no vowel. "y" counts as a vowel
    so that ordinary words like "my", "by", "fly" and "sky" survive.

    Args:
        raw_story: Text as produced by the story generator.

    Returns:
        The cleaned text with tokens joined by single spaces (possibly empty).
    """
    # Remove URLs starting with http://, https://, or www.
    text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', raw_story)
    # Remove domain names without protocol (e.g., erskybooks.com)
    text = re.sub(r'\b\w+\.(?:com|net|org|co\.uk|ca\.us|me)\b', '', text)
    # Remove all digits
    text = re.sub(r'\d+', '', text)

    vowels = frozenset('aeiouyAEIOUY')

    def is_valid_word(word: str) -> bool:
        # Judge the letters only, so "a," is treated like "a" and "e." like "e"
        core = word.strip(string.punctuation)
        if not core:
            # Token consisted solely of punctuation (e.g. "-" or "...")
            return False
        if len(core) == 1:
            # Allow "a" and "I" as the only single-letter words
            return core.lower() in ('a', 'i')
        # Longer tokens need at least one vowel to look like a real word
        return any(char in vowels for char in core)

    # Split on whitespace, filter, and reassemble the first 100 tokens
    kept_words = [word for word in text.split() if is_valid_word(word)]
    return " ".join(kept_words[:100])
def get_caption(image) -> str:
    """Caption an image, show the caption in the app, and return it.

    Args:
        image: Whatever ``PIL.Image.open`` accepts; it is passed straight to it.

    Returns:
        The ``generated_text`` field of the captioner's first result.
    """
    results = captioner(Image.open(image))
    first_result = results[0]
    description = first_result["generated_text"]

    # Heading first, then the caption itself
    st.write("**๐ What's in the picture: ๐**")
    st.write(description)
    return description
def get_story(caption: str) -> str:
    """Generate, clean, display and return a short children's story.

    Args:
        caption: Scene description that is interpolated into the prompt.

    Returns:
        The generated continuation after ``clean_generated_story`` has been
        applied to it.
    """
    prompt = (
        f"Write a funny and playful story for young children precisely centered on this scene {caption}\nStory: "
        f"mention the exact place, location or venue within {caption}. "
        f"Make the story magical and exciting."
    )
    outputs = storyer(
        prompt,
        max_new_tokens=150,
        # temperature/top_p only apply in sampling mode; request it explicitly
        # instead of relying on whatever the model's own config defaults to.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        no_repeat_ngram_size=2,
        # Only the continuation is wanted, not the prompt echoed back
        return_full_text=False,
    )
    raw = outputs[0]["generated_text"].strip()
    story = clean_generated_story(raw)
    st.write("**๐ Your funny story: ๐**")
    st.write(story)
    return story
def generate_audio(story: str) -> str:
    """Synthesize speech for a story and return the path of the WAV file.

    The story is wrapped into chunks of at most 200 characters, each chunk is
    passed to the ``tts`` pipeline, and the resulting waveforms are
    concatenated and written to a single temporary file.

    Args:
        story: Text to be read aloud.

    Returns:
        Path of a temporary ``.wav`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If ``story`` is empty or contains only whitespace.
    """
    chunks = textwrap.wrap(story, width=200)
    if not chunks:
        # np.concatenate([]) would raise a ValueError with an unrelated-looking
        # message; fail early with one that names the actual problem.
        raise ValueError("Cannot generate audio from an empty story.")
    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])

    # Reserve a unique path and close the handle before the file is reopened
    # by name for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_file_path = temp_file.name
    try:
        sf.write(temp_file_path, audio, tts.model.config.sampling_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when writing fails
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        raise
    return temp_file_path
def generate_content(image):
    """Run the image -> caption -> story -> audio flow end to end.

    Args:
        image: The uploaded image; forwarded unchanged to ``get_caption``.

    Returns:
        A ``(caption, story, audio_path)`` tuple, where ``audio_path`` is the
        value returned by ``generate_audio`` for the story.
    """
    scene = get_caption(image)
    tale = get_story(scene)
    return scene, tale, generate_audio(tale)
# Streamlit UI section
st.title("โจ Magic Story Maker โจ")
st.markdown("Upload a picture to make a funny story and hear it too! ๐ธ")

uploaded_image = st.file_uploader("Choose your picture", type=["jpg", "jpeg", "png"])

# Preview area: the uploaded picture if there is one, otherwise a placeholder
if uploaded_image is None:
    st.image("https://example.com/placeholder_image.jpg", caption="Upload your picture here! ๐ท", use_container_width=True)
else:
    st.image(uploaded_image, caption="Your Picture ๐", use_container_width=True)

if st.button("โจ Make My Story! โจ"):
    if uploaded_image is not None:
        # The spinner covers only the slow part: caption, story and audio generation
        with st.spinner("๐ฎ Creating your magical story..."):
            caption, story, audio_path = generate_content(uploaded_image)
        try:
            st.success("๐ Your story is ready! ๐")
            st.audio(audio_path, format="audio/wav")
        finally:
            # audio_path points at a temporary file created with delete=False;
            # remove it even if rendering the result fails.
            os.remove(audio_path)
    else:
        st.warning("Please upload a picture first! ๐ธ")