szeandlink/Project_Testing

Sleeping

App Files Files Community

Szeyu committed on Apr 30

Commit

e1351c4

verified ·

1 Parent(s): b540ff3

Update app.py

Browse files

Files changed (1) hide show

app.py +22 -14

app.py CHANGED Viewed

@@ -37,6 +37,7 @@ st.markdown(
 def load_models():
     """
     Lazy-load the required pipelines and store them in session state.
     Pipelines:
       1. Captioner: Generates descriptive text from an image using a lighter model.
       2. Storyer: Generates a humorous children's story using aspis/gpt2-genre-story-generation.
@@ -62,11 +63,10 @@ def load_models():
 @st.cache_data(show_spinner=False)
 def get_caption(image_bytes):
     """
-    Converts image bytes into a lower resolution image (256x256 maximum)
     and generates a caption.
     """
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    # Resize image to 256x256 maximum for faster processing
     image.thumbnail((256, 256))
     caption = st.session_state.captioner(image)[0]["generated_text"]
     return caption
@@ -76,46 +76,51 @@ def get_story(caption):
     """
     Generates a humorous and engaging children's story based on the caption.
     Uses a prompt to instruct the model and limits token generation to 80 tokens.
     """
     prompt = (
         f"Write a funny, warm, and imaginative children's story for ages 3-10, 50-100 words, "
         f"{caption}\nStory: in third-person narrative, as if the author is playfully describing the scene in the image."
     )
-    raw_story = st.session_state.storyer(
         prompt,
         max_new_tokens=80,
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
         return_full_text=False
-    )[0]["generated_text"].strip()
     words = raw_story.split()
-    return " ".join(words[:100])
 @st.cache_data(show_spinner=False)
 def get_audio(story):
     """
     Converts the generated story text into audio.
-    Splits the text into 300-character chunks to reduce repeated TTS calls.
-    Checks each chunk, and if no valid audio is produced, creates a brief default silent audio.
     """
     chunks = textwrap.wrap(story, width=300)
     audio_chunks = []
     for chunk in chunks:
         try:
             output = st.session_state.tts(chunk)
-            # Some pipelines return a list; if so, use the first element.
             if isinstance(output, list):
                 output = output[0]
             if "audio" in output:
-                # Ensure the audio is a numpy array and squeeze any extra dimensions.
                 audio_array = np.array(output["audio"]).squeeze()
                 audio_chunks.append(audio_array)
-        except Exception as e:
-            # Skip any chunk that raises an error.
             continue
-    # If no audio was generated, produce 1 second of silence as a fallback.
     if not audio_chunks:
         sr = st.session_state.tts.model.config.sampling_rate
         audio = np.zeros(sr, dtype=np.float32)
@@ -133,7 +138,6 @@ if uploaded_file is not None:
     try:
         load_models()  # Ensure models are loaded
         image_bytes = uploaded_file.getvalue()
-        # Display the uploaded image
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         st.image(image, caption="Your Amazing Picture!", use_column_width=True)
         st.markdown("<h3 style='text-align: center;'>Ready for your story?</h3>", unsafe_allow_html=True)
@@ -147,7 +151,11 @@ if uploaded_file is not None:
             with st.spinner("Generating story..."):
                 story = get_story(caption)
             st.markdown("<h3 style='text-align: center;'>Your Story:</h3>", unsafe_allow_html=True)
-            st.write(story)
             with st.spinner("Generating audio..."):
                 audio_buffer = get_audio(story)

 def load_models():
     """
     Lazy-load the required pipelines and store them in session state.
     Pipelines:
       1. Captioner: Generates descriptive text from an image using a lighter model.
       2. Storyer: Generates a humorous children's story using aspis/gpt2-genre-story-generation.
 @st.cache_data(show_spinner=False)
 def get_caption(image_bytes):
     """
+    Converts image bytes into a lower resolution image (maximum 256x256)
     and generates a caption.
     """
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     image.thumbnail((256, 256))
     caption = st.session_state.captioner(image)[0]["generated_text"]
     return caption
     """
     Generates a humorous and engaging children's story based on the caption.
     Uses a prompt to instruct the model and limits token generation to 80 tokens.
+    If no text is generated, a fallback story is returned.
     """
     prompt = (
         f"Write a funny, warm, and imaginative children's story for ages 3-10, 50-100 words, "
         f"{caption}\nStory: in third-person narrative, as if the author is playfully describing the scene in the image."
     )
+    result = st.session_state.storyer(
         prompt,
         max_new_tokens=80,
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
         return_full_text=False
+    )
+    # Log the raw result for debugging (viewable in server logs)
+    print("Story generation raw result:", result)
+    raw_story = result[0].get("generated_text", "").strip()
+    if not raw_story:
+        raw_story = "Once upon a time, the park was filled with laughter as children played happily under the bright sun."
     words = raw_story.split()
+    story = " ".join(words[:100])
+    return story
 @st.cache_data(show_spinner=False)
 def get_audio(story):
     """
     Converts the generated story text into audio.
+    Splits the text into 300-character chunks, processes each via the TTS pipeline,
+    and concatenates the resulting audio arrays. If no audio is generated, 1 second of silence is used.
     """
     chunks = textwrap.wrap(story, width=300)
     audio_chunks = []
     for chunk in chunks:
         try:
             output = st.session_state.tts(chunk)
             if isinstance(output, list):
                 output = output[0]
             if "audio" in output:
                 audio_array = np.array(output["audio"]).squeeze()
                 audio_chunks.append(audio_array)
+        except Exception:
             continue
     if not audio_chunks:
         sr = st.session_state.tts.model.config.sampling_rate
         audio = np.zeros(sr, dtype=np.float32)
     try:
         load_models()  # Ensure models are loaded
         image_bytes = uploaded_file.getvalue()
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         st.image(image, caption="Your Amazing Picture!", use_column_width=True)
         st.markdown("<h3 style='text-align: center;'>Ready for your story?</h3>", unsafe_allow_html=True)
             with st.spinner("Generating story..."):
                 story = get_story(caption)
             st.markdown("<h3 style='text-align: center;'>Your Story:</h3>", unsafe_allow_html=True)
+            # If the story is empty (or consists only of whitespace), display a default message.
+            if not story.strip():
+                st.write("No story was generated. Please try again.")
+            else:
+                st.write(story)
             with st.spinner("Generating audio..."):
                 audio_buffer = get_audio(story)