File size: 4,070 Bytes
4b14b3d
29e42d5
 
 
 
 
 
 
37631cc
f22469a
29e42d5
f22469a
29e42d5
e35a81f
f0a6b70
e35a81f
 
 
29e42d5
e35a81f
29e42d5
ccd016d
 
 
 
 
 
 
 
 
f22469a
e35a81f
 
37631cc
f22469a
e35a81f
822643b
 
29e42d5
ccd016d
 
 
 
 
e35a81f
29e42d5
ccd016d
 
 
29e42d5
f0a6b70
f22469a
e35a81f
 
29e42d5
e35a81f
 
 
29e42d5
 
 
f22469a
 
 
 
 
 
 
 
e35a81f
f22469a
822643b
 
29e42d5
f22469a
e35a81f
 
29e42d5
f22469a
e35a81f
 
 
29e42d5
e35a81f
29e42d5
f22469a
ccd016d
 
 
 
 
 
 
 
 
 
 
822643b
 
29e42d5
f22469a
822643b
 
f22469a
822643b
 
 
f22469a
822643b
 
e35a81f
822643b
e35a81f
822643b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# import part
import streamlit as st
from transformers import pipeline
import textwrap
import numpy as np
import soundfile as sf
import tempfile
import os
from PIL import Image
import string

# Initialize pipelines with caching
@st.cache_resource
def load_pipelines():
    """Build the three Hugging Face pipelines this app relies on.

    The pipelines are created in a fixed order: image captioning, story
    (text) generation, then text-to-speech. The function is wrapped in
    ``st.cache_resource`` so the result is cached by Streamlit.

    Returns:
        A ``(captioner, storyer, tts)`` tuple of pipeline objects.
    """
    # (task, model id) pairs, listed in the order they are returned.
    model_specs = (
        ("image-to-text", "Salesforce/blip-image-captioning-large"),
        ("text-generation", "aspis/gpt2-genre-story-generation"),
        ("text-to-speech", "facebook/mms-tts-eng"),
    )
    return tuple(pipeline(task, model=model_id) for task, model_id in model_specs)

# Bind the three pipelines as module-level names; generate_content reads
# captioner, storyer and tts as globals. load_pipelines is decorated with
# st.cache_resource, so this call is expected to reuse the cached pipelines
# on script reruns rather than rebuilding them.
captioner, storyer, tts = load_pipelines()

# Function to extract keywords from caption
def extract_keywords(caption):
    """Return the content words of *caption*, lower-cased, in original order.

    The caption is lower-cased and split on whitespace. Surrounding
    punctuation (``. , ! ? " '``) is stripped from each token *before* the
    token is filtered, so a stop word followed by punctuation (``"the,"``)
    is still recognised as a stop word, and the length check applies to the
    bare word rather than to the word plus its punctuation.

    Args:
        caption: Caption text to extract keywords from. ``None`` or an empty
            string is treated as "no text".

    Returns:
        A list of tokens that are longer than two characters and are not
        stop words. Duplicates are kept. Returns ``[]`` when nothing
        qualifies.
    """
    stop_words = {'the', 'a', 'an', 'in', 'on', 'at', 'of', 'to', 'is', 'are', 'with', 'and'}
    if not caption:
        return []

    # Strip first, filter second: filtering the raw token would let
    # punctuation-decorated stop words and too-short words slip through.
    tokens = (raw.strip(".,!?\"'") for raw in caption.lower().split())
    return [token for token in tokens if len(token) > 2 and token not in stop_words]

# Function to generate content from an image
def generate_content(image):
    """Caption an image, write a short story about it, and narrate the story.

    The caption and the story are also written to the Streamlit page as a
    side effect.

    Args:
        image: Anything ``PIL.Image.open`` accepts (a path or a binary
            file-like object); in this file it is the Streamlit upload.

    Returns:
        A ``(caption, story, temp_file_path)`` tuple. ``temp_file_path``
        points to a WAV file created with ``delete=False``; the caller is
        responsible for removing it.

    Raises:
        ValueError: If neither the generated story nor the caption contains
            any text to narrate.
    """
    pil_image = Image.open(image)

    # Generate caption
    caption = captioner(pil_image)[0]["generated_text"]
    st.write("**🌟 What's in the picture: 🌟**")
    st.write(caption)

    # Create prompt for story, ensuring keywords are included
    prompt = _build_story_prompt(caption, extract_keywords(caption))

    # Generate raw story
    raw = storyer(
        prompt,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        no_repeat_ngram_size=2,
        return_full_text=False
    )[0]["generated_text"].strip()

    # If cleaning leaves nothing (e.g. the model emitted only symbols), fall
    # back to the caption so there is still something to show and narrate.
    story = _clean_story(raw) or caption.strip()

    st.write("**📖 Your funny story: 📖**")
    st.write(story)

    # Generate audio from cleaned story and save it to a temporary file
    audio = _synthesize_speech(story)
    temp_file_path = _write_temp_wav(audio, tts.model.config.sampling_rate)

    return caption, story, temp_file_path


# Characters kept in the story text (removes symbols like * and ~).
_STORY_ALLOWED_CHARS = frozenset(string.ascii_letters + string.digits + " .,!?\"'-")


def _build_story_prompt(caption, keywords):
    """Return the text-generation prompt for *caption* and its *keywords*."""
    parts = [
        "Write a funny, warm children's story for ages 3-10, 50–100 words, ",
        f"in third-person narrative, that describes this scene exactly: {caption}. ",
    ]
    # Skip the keyword sentence entirely rather than emitting "...story: ."
    if keywords:
        keywords_str = ", ".join(keywords)
        parts.append(
            f"Explicitly include these keywords from the caption in the story: {keywords_str}. "
        )
    parts.append(
        "Mention the exact place, location, or venue within the scene, such as a park, pool, or gym."
    )
    return "".join(parts)


def _clean_story(raw, max_words=100):
    """Return *raw* restricted to the allowed characters and *max_words* words."""
    # Turn every whitespace run (including newlines and tabs) into one space
    # BEFORE filtering; otherwise the filter deletes the newline and glues the
    # neighbouring words together.
    normalized = " ".join(raw.split())
    kept = "".join(ch for ch in normalized if ch in _STORY_ALLOWED_CHARS)
    return " ".join(kept.split()[:max_words])


def _synthesize_speech(text, chunk_width=200):
    """Return one 1-D waveform for *text*, synthesized chunk by chunk."""
    chunks = textwrap.wrap(text, width=chunk_width)
    if not chunks:
        # np.concatenate([]) would fail with a far less helpful message.
        raise ValueError("Cannot synthesize speech from empty text.")
    # atleast_1d guards against squeeze() producing a 0-d array, which
    # np.concatenate rejects.
    segments = [np.atleast_1d(np.squeeze(tts(chunk)["audio"])) for chunk in chunks]
    return np.concatenate(segments)


def _write_temp_wav(audio, sampling_rate):
    """Write *audio* to a new temporary ``.wav`` file and return its path."""
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    # Close our own handle first so soundfile is the only writer of the file.
    handle.close()
    try:
        sf.write(handle.name, audio, sampling_rate)
    except Exception:
        # delete=False means nobody else will clean this up if the write fails.
        os.remove(handle.name)
        raise
    return handle.name

# Streamlit UI
# Page-wide background gradient, injected as raw CSS.
st.markdown(
    """
    <style>
    .stApp {
        background: radial-gradient(circle, #e6f3ff, #e6fff2);
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("✨ Magic Story Maker ✨")
st.markdown("Upload a picture to make a funny story and hear it too! 📸")

uploaded_image = st.file_uploader("Choose your picture", type=["jpg", "jpeg", "png"])

# NOTE(review): use_column_width is reported as deprecated in newer Streamlit
# releases in favour of use_container_width; confirm against the Streamlit
# version this app is pinned to before switching.
if uploaded_image is None:
    # NOTE(review): this URL looks like a stand-in and may not resolve to a
    # real image; replace it with an actual asset.
    st.image("https://example.com/placeholder_image.jpg", caption="Upload your picture here! 📷", use_column_width=True)
else:
    st.image(uploaded_image, caption="Your Picture 🌟", use_column_width=True)

if st.button("✨ Make My Story! ✨"):
    if uploaded_image is not None:
        with st.spinner("🔮 Creating your magical story..."):
            # Only the audio path is needed here; generate_content already
            # writes the caption and story to the page itself.
            _, _, audio_path = generate_content(uploaded_image)
            try:
                st.success("🎉 Your story is ready! 🎉")
                st.audio(audio_path, format="audio/wav")
            finally:
                # The WAV is a delete=False temp file: remove it even when
                # rendering fails, so reruns do not pile up files on disk.
                os.remove(audio_path)
    else:
        st.warning("Please upload a picture first! 📸")