File size: 4,832 Bytes
bc7ea9a
29e42d5
 
 
 
 
b9ea623
29e42d5
37631cc
f22469a
29e42d5
f22469a
29e42d5
e35a81f
df08c46
e35a81f
 
 
29e42d5
e35a81f
29e42d5
bc7ea9a
 
 
890cd41
 
 
 
bc7ea9a
890cd41
 
 
 
 
 
 
bc7ea9a
 
 
 
 
 
 
 
 
 
 
 
890cd41
bc7ea9a
 
 
890cd41
bc7ea9a
 
 
4cf543b
 
 
 
e35a81f
 
822643b
 
4cf543b
29e42d5
4cf543b
 
 
 
e35a81f
890cd41
4cf543b
04471ed
29e42d5
bc7ea9a
e35a81f
 
e9ee97c
019caa7
 
 
29e42d5
 
f22469a
bc7ea9a
822643b
 
4cf543b
29e42d5
4cf543b
 
890cd41
4cf543b
e35a81f
 
4cf543b
890cd41
e35a81f
 
 
4cf543b
29e42d5
4cf543b
 
890cd41
 
 
 
4cf543b
 
 
 
 
29e42d5
bc7ea9a
822643b
 
29e42d5
f22469a
822643b
 
8b6a9db
822643b
8b6a9db
59b54a5
 
 
822643b
e35a81f
822643b
e35a81f
59b54a5
 
890cd41
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import re
import streamlit as st
from transformers import pipeline
import textwrap
import numpy as np
import soundfile as sf
import tempfile
import os
from PIL import Image
import string

# Initialize pipelines with caching
@st.cache_resource
def load_pipelines():
    """
    Build the three Hugging Face pipelines used by the app.

    The function is wrapped in ``st.cache_resource``, so the returned
    objects are handed to Streamlit's resource cache.

    Returns:
        A ``(captioner, storyer, tts)`` tuple: the image-to-text,
        text-generation and text-to-speech pipelines, in that order.
    """
    # (task, model id) pairs, listed in the order the caller unpacks them.
    model_specs = (
        ("image-to-text", "Salesforce/blip-image-captioning-large"),
        ("text-generation", "aspis/gpt2-genre-story-generation"),
        ("text-to-speech", "facebook/mms-tts-eng"),
    )
    caption_pipe, story_pipe, speech_pipe = (
        pipeline(task, model=model_id) for task, model_id in model_specs
    )
    return caption_pipe, story_pipe, speech_pipe

# Module-level handles to the captioning, story-generation and text-to-speech
# pipelines; the functions below read these globals directly.
captioner, storyer, tts = load_pipelines()

def clean_generated_story(raw_story: str, max_words: int = 100) -> str:
    """
    Clean raw generator output so it reads as a short story.

    The following are removed, in this order:
    1. URLs starting with ``http://``, ``https://`` or ``www.``, and bare
       domain names ending in a few common suffixes (e.g. ``erskybooks.com``).
    2. All digits.
    3. Tokens of two or more characters that contain no vowel. "y" counts as
       a vowel so that real words such as "my", "by", "fly" and "sky" survive.
    4. Single-character tokens other than "a" and "i" (either case).

    Tokens are whitespace-separated, so attached punctuation is part of the
    token when it is checked.

    Args:
        raw_story: Text produced by the story generator.
        max_words: Maximum number of surviving words to keep; the rest are
            dropped. Defaults to 100.

    Returns:
        The surviving words joined by single spaces (an empty string when
        nothing survives).

    Raises:
        ValueError: If ``max_words`` is negative.
    """
    if max_words < 0:
        raise ValueError("max_words must be non-negative")

    # Remove URLs starting with http://, https://, or www.
    text = re.sub(r'\b(?:https?://|www\.)\S+\b', '', raw_story)
    # Remove domain names without protocol (e.g., erskybooks.com)
    text = re.sub(r'\b\w+\.(com|net|org|co\.uk|ca\.us|me)\b', '', text)
    # Remove all digits
    text = re.sub(r'\d+', '', text)

    # "y" is included: many short, common English words use it as their only vowel.
    vowels = frozenset('aeiouyAEIOUY')

    def is_valid_word(word: str) -> bool:
        # Only "a" and "I" are accepted as single-letter words.
        if len(word) == 1:
            return word.lower() in ('a', 'i')
        # Longer tokens without any vowel are treated as random letter noise.
        return any(char in vowels for char in word)

    kept_words = [word for word in text.split() if is_valid_word(word)]
    return " ".join(kept_words[:max_words])

def get_caption(image) -> str:
    """
    Generate a caption for an image and display it in the Streamlit page.

    Args:
        image: Anything ``PIL.Image.open`` accepts; the UI passes the object
            returned by ``st.file_uploader``.

    Returns:
        The ``generated_text`` of the first result from the captioning
        pipeline.
    """
    # The context manager releases the PIL image once captioning is done.
    with Image.open(image) as pil_image:
        caption = captioner(pil_image)[0]["generated_text"]
    st.write("**🌟 What's in the picture: 🌟**")
    st.write(caption)
    return caption

def get_story(caption: str) -> str:
    """
    Generate a short children's story from a caption and display it.

    The caption is embedded in a prompt, passed to the text-generation
    pipeline, and the generated continuation is run through
    ``clean_generated_story`` before being written to the Streamlit page.

    Args:
        caption: Description of the scene the story should be about.

    Returns:
        The cleaned story text.
    """
    prompt = (
        f"Write a funny and playful story for young children precisely centered on this scene {caption}\nStory: "
        f"mention the exact place, location or venue within {caption}. "
        "Make the story magical and exciting."
    )

    generations = storyer(
        prompt,
        max_new_tokens=150,
        # temperature/top_p only take effect when sampling is enabled, so
        # request it explicitly instead of relying on the model's defaults.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        no_repeat_ngram_size=2,
        # Return only the continuation, not the prompt itself.
        return_full_text=False,
    )
    raw = generations[0]["generated_text"].strip()

    story = clean_generated_story(raw)
    st.write("**📖 Your funny story: 📖**")
    st.write(story)
    return story

def generate_audio(story: str) -> str:
    """
    Convert the story text to speech and save it as a temporary WAV file.

    The story is wrapped into chunks of at most 200 characters, each chunk is
    synthesised separately, and the resulting audio arrays are concatenated.

    Args:
        story: Text to read aloud.

    Returns:
        Path of the WAV file that was written. The file is not deleted
        automatically; the caller is responsible for removing it.

    Raises:
        ValueError: If ``story`` is empty or contains only whitespace, since
            there is nothing to synthesise.
    """
    chunks = textwrap.wrap(story, width=200)
    if not chunks:
        # Without this guard np.concatenate fails with an opaque
        # "need at least one array" error.
        raise ValueError("Cannot generate audio: the story is empty.")

    audio = np.concatenate([tts(chunk)["audio"].squeeze() for chunk in chunks])

    # Reserve a file name, then close the handle before the audio library
    # opens the same path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_file_path = temp_file.name
    try:
        sf.write(temp_file_path, audio, tts.model.config.sampling_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when writing fails.
        os.remove(temp_file_path)
        raise
    return temp_file_path

def generate_content(image):
    """
    Run the full image -> caption -> story -> audio chain.

    Args:
        image: The uploaded image, forwarded to ``get_caption``.

    Returns:
        A ``(caption, story, audio_path)`` tuple, where ``audio_path`` is the
        value returned by ``generate_audio`` for the story.
    """
    # Each stage feeds the next: caption drives the story, story drives the audio.
    caption_text = get_caption(image)
    story_text = get_story(caption_text)
    return caption_text, story_text, generate_audio(story_text)

# Streamlit UI section: title, image upload/preview, and the button that runs
# the caption -> story -> audio chain.
st.title("✨ Magic Story Maker ✨")
st.markdown("Upload a picture to make a funny story and hear it too! 📸")

uploaded_image = st.file_uploader("Choose your picture", type=["jpg", "jpeg", "png"])

if uploaded_image is None:
    # NOTE(review): example.com is a reserved documentation domain, so this
    # URL is presumably a stand-in; confirm and point it at a real asset.
    st.image("https://example.com/placeholder_image.jpg", caption="Upload your picture here! 📷", use_container_width=True)
else:
    st.image(uploaded_image, caption="Your Picture 🌟", use_container_width=True)

if st.button("✨ Make My Story! ✨"):
    if uploaded_image is None:
        st.warning("Please upload a picture first! 📸")
    else:
        with st.spinner("🔮 Creating your magical story..."):
            caption, story, audio_path = generate_content(uploaded_image)
            try:
                st.success("🎉 Your story is ready! 🎉")
                # NOTE(review): the file is deleted right after this call, which
                # assumes st.audio reads the file eagerly; confirm.
                st.audio(audio_path, format="audio/wav")
            finally:
                # Always remove the temporary WAV, even if rendering fails.
                os.remove(audio_path)