Spaces:

justREE
/

Assignment1

Sleeping

App Files Files Community

justREE committed on May 1

Commit

70a4e6d

verified ·

1 Parent(s): 321b768

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +187 -85

src/streamlit_app.py CHANGED Viewed

@@ -9,68 +9,85 @@ from PIL import Image       # Python Imaging Library for image loading
 import numpy as np          # numerical operations, especially array handling
 # 1) CACHE & LOAD MODELS
 @st.cache_resource(show_spinner=False)
 def load_captioner():
-    # Loads BLIP image-to-text model; cached so it loads only once.
-    # Returns: a function captioner(image: PIL.Image) -> List[Dict],
     return pipeline(
         "image-to-text",
         model="Salesforce/blip-image-captioning-base",
-        device="cpu" # Can change to "cuda" if GPU is available
     )
 @st.cache_resource(show_spinner=False)
 def load_story_pipe():
-    # Loads FLAN-T5 text-to-text model for story generation; cached once.
     # Returns: a function story_pipe(prompt: str, **kwargs) -> List[Dict].
     return pipeline(
         "text2text-generation",
         model="google/flan-t5-base",
-        device="cpu" # Can change to "cuda" if GPU is available
     )
 @st.cache_resource(show_spinner=False)
 def load_tts_pipe():
-    # Loads Meta MMS-TTS text-to-speech model; cached once.
     # Returns: a function tts_pipe(text: str) -> List[Dict] with "audio" and "sampling_rate".
     return pipeline(
         "text-to-speech",
         model="facebook/mms-tts-eng",
-        device="cpu" # Can change to "cuda" if GPU is available
     )
 # 2) HELPER FUNCTIONS
 def sentence_case(text: str) -> str:
-    # Splits text into sentences on .!? delimiters,
-    # capitalizes the first character of each sentence,
-    # then rejoins into a single string.
-    parts = re.split(r'([.!?])', text)  # ["hello", ".", " world", "!"]
     out = []
     for i in range(0, len(parts) - 1, 2):
-        sentence = parts[i].strip().capitalize()  # capitalize first letter
-        delimiter = parts[i + 1]                  # punctuation
-        # Ensure a space before the sentence if it wasn't the very first part
-        if out and not sentence.startswith(' ') and out[-1][-1] not in '.!?':
-             out.append(f" {sentence}{delimiter}")
-        else:
-            out.append(f"{sentence}{delimiter}")
-    # If trailing text without punctuation exists, capitalize and append it.
     if len(parts) % 2:
-        last = parts[-1].strip().capitalize()
-        if last:
-             # Ensure a space before if needed
-             if out and not last.startswith(' ') and out[-1][-1] not in '.!?':
-                 out.append(f" {last}")
-             else:
-                 out.append(last)
-    # Clean up potential multiple spaces resulting from split/join
     return " ".join(" ".join(out).split())
 def caption_image(img: Image.Image, captioner) -> str:
-    # Given a PIL image and a captioner pipeline, returns a single-line caption.
     results = captioner(img)  # run model
     if not results:
         return ""
@@ -78,50 +95,86 @@ def caption_image(img: Image.Image, captioner) -> str:
     return results[0].get("generated_text", "")
 def story_from_caption(caption: str, pipe) -> str:
-    # Given a caption string and a text2text pipeline, returns a ~100-word story.
     prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
     results = pipe(
         prompt,
         max_length=120,            # increased max length slightly
-        min_length=80,             # minimum generated tokens
-        do_sample=True,            # enable sampling
         top_k=100,                 # sample from top_k tokens
         top_p=0.9,                 # nucleus sampling threshold
-        temperature=0.7,           # sampling temperature
         repetition_penalty=1.1,    # discourage repetition
         no_repeat_ngram_size=4,    # block repeated n-grams
         early_stopping=False
     )
     raw = results[0]["generated_text"].strip()  # full generated text
     # strip out the prompt if it echoes back - make comparison case-insensitive
-    if raw.lower().startswith(prompt.lower()):
-        raw = raw[len(prompt):].strip()
     # trim to last complete sentence ending in . ! or ?
-    match = re.search(r'[.!?]', raw[::-1]) # Search for the first punctuation from the end
     if match:
-        raw = raw[:len(raw) - match.start()] # Trim at that position
-    elif len(raw) > 80: # If no punctuation found but story is long, trim to a reasonable length
-         raw = raw[:80] + "..."
     return sentence_case(raw)
 def tts_bytes(text: str, tts_pipe) -> bytes:
-    # Given a text string and a tts pipeline, returns WAV-format bytes.
-    # Clean up text for TTS - remove leading/trailing quotes, etc.
     cleaned_text = re.sub(r'^["\']|["\']$', '', text).strip()
-    # Basic punctuation cleaning (optional, depending on TTS model)
-    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text) # Replace multiple periods with one
-    cleaned_text = cleaned_text.replace('…', '...') # Replace ellipsis char with dots
-    # Add a period if the text doesn't end with punctuation (helps TTS model finalize)
     if cleaned_text and cleaned_text[-1] not in '.!?':
          cleaned_text += '.'
     output = tts_pipe(cleaned_text)
     # pipeline may return list or single dict
     result = output[0] if isinstance(output, list) else output
-    audio_array = result["audio"]            # numpy array: (channels, samples) or (samples,)
-    rate = result["sampling_rate"]           # sampling rate integer
     # ensure audio_array is 2D (samples, channels) for consistent handling
     if audio_array.ndim == 1:
@@ -135,74 +188,123 @@ def tts_bytes(text: str, tts_pipe) -> bytes:
     buffer = io.BytesIO()
     wf = wave.open(buffer, "wb")
-    wf.setnchannels(data.shape[1])         # number of channels
-    wf.setsampwidth(2)                     # 16 bits = 2 bytes
-    wf.setframerate(rate)                  # samples per second
-    wf.writeframes(pcm.tobytes())          # write PCM data
-    wf.close()
     buffer.seek(0)
     return buffer.read()                   # return raw WAV bytes
 # 3) STREAMLIT USER INTERFACE
 st.set_page_config(page_title="Imagine & Narrate", page_icon="✨", layout="centered")
 st.title("✨ Imagine & Narrate")
 st.write("Upload any image below to see AI imagine and narrate a story about it!")
-# -- Upload image widget --
 uploaded = st.file_uploader(
     "Choose an image file",
-    type=["jpg", "jpeg", "png"]
 )
 if not uploaded:
     st.info("➡️ Upload an image above to start the magic!")
-    st.stop()
-# Load the uploaded file into a PIL Image
-try:
-    img = Image.open(uploaded)
-except Exception as e:
-    st.error(f"Error loading image: {e}")
-    st.stop()
-# -- Step 1: Display the image --
 st.subheader("📸 Your Visual Input")
-st.image(img, use_container_width=True)
 st.divider()
-# -- Step 2: Generate and display caption --
 st.subheader("🧠 Generating Insights")
-with st.spinner("Scanning image for key elements…"):
-    captioner = load_captioner()
-    raw_caption = caption_image(img, captioner)
-    if not raw_caption:
-         st.warning("Could not generate a caption for the image.")
-         st.stop()
-    caption = sentence_case(raw_caption)
 st.markdown(f"**Identified Scene:** {caption}")
 st.divider()
-# -- Step 3: Generate and display story --
 st.subheader("📖 Crafting a Narrative")
-with st.spinner("Writing a compelling story…"):
-    story_pipe = load_story_pipe()
-    story = story_from_caption(caption, story_pipe)
-    if not story or story.strip() == '...': # Check for empty or minimal story
-         st.warning("Could not generate a meaningful story from the caption.")
-         st.stop()
 st.write(story)
 st.divider()
-# -- Step 4: Synthesize and play audio --
 st.subheader("👂 Hear the Story")
-with st.spinner("Synthesizing audio narration…"):
-    tts_pipe = load_tts_pipe()
     try:
         audio_bytes = tts_bytes(story, tts_pipe)
-        st.audio(audio_bytes, format="audio/wav")
     except Exception as e:
-        st.error(f"Error generating audio: {e}")
-# Celebration animation
 st.balloons()

 import numpy as np          # numerical operations, especially array handling
 # 1) CACHE & LOAD MODELS
+# Use cache_resource for models/objects that should be loaded once per session/run
 @st.cache_resource(show_spinner=False)
 def load_captioner():
     """Create the image-captioning pipeline (BLIP base checkpoint).

     Decorated with ``st.cache_resource`` so repeated calls reuse one
     pipeline object instead of rebuilding it. The model is pinned to the
     CPU; change ``"device"`` to ``"cuda"`` to run on a GPU.

     Returns:
         The object built by ``pipeline("image-to-text", ...)``. It is
         called elsewhere in this file with a PIL image and indexed as a
         list of dicts carrying ``"generated_text"``.
     """
     settings = {
         "model": "Salesforce/blip-image-captioning-base",
         "device": "cpu",
     }
     return pipeline("image-to-text", **settings)
 @st.cache_resource(show_spinner=False)
 def load_story_pipe():
     """Create the story-writing pipeline (FLAN-T5 base checkpoint).

     Decorated with ``st.cache_resource`` so repeated calls reuse one
     pipeline object instead of rebuilding it. The model is pinned to the
     CPU; change ``"device"`` to ``"cuda"`` to run on a GPU.

     Returns:
         The object built by ``pipeline("text2text-generation", ...)``. It
         is called elsewhere in this file with a prompt string plus
         generation keyword arguments and indexed as a list of dicts
         carrying ``"generated_text"``.
     """
     settings = {
         "model": "google/flan-t5-base",
         "device": "cpu",
     }
     return pipeline("text2text-generation", **settings)
 @st.cache_resource(show_spinner=False)
 def load_tts_pipe():
     """Create the narration pipeline (Meta MMS-TTS English checkpoint).

     Decorated with ``st.cache_resource`` so repeated calls reuse one
     pipeline object instead of rebuilding it. The model is pinned to the
     CPU; change ``"device"`` to ``"cuda"`` to run on a GPU.

     Returns:
         The object built by ``pipeline("text-to-speech", ...)``. Its
         output is read elsewhere in this file through the ``"audio"`` and
         ``"sampling_rate"`` keys.
     """
     settings = {
         "model": "facebook/mms-tts-eng",
         "device": "cpu",
     }
     return pipeline("text-to-speech", **settings)
 # 2) HELPER FUNCTIONS
 def sentence_case(text: str) -> str:
     """Capitalize the first character of every sentence in *text*.

     A sentence ends at a run of ``.``, ``!`` or ``?``. The run is kept
     intact (``...`` stays ``...``, ``?!`` stays ``?!``) and a terminator
     directly followed by a digit is not treated as a sentence end, so
     decimals such as ``3.14`` are left alone. Whitespace in front of a
     terminator is dropped, every other whitespace run collapses to one
     space, and a single space separates consecutive sentences. Only the
     first character of each sentence is changed; the remaining characters
     keep their original casing.

     Args:
         text: Raw text, possibly with irregular spacing or casing.

     Returns:
         The normalized text, or ``""`` when *text* has no content.
     """
     # One capture group makes re.split alternate [text, terminator, text, ...]
     # and always end on a (possibly empty) text piece. The lookahead keeps a
     # terminator run from being split in the middle and from matching inside
     # a number.
     pieces = re.split(r'([.!?]+)(?![.!?\d])', text)
     sentences = []
     for body, terminator in zip(pieces[0::2], pieces[1::2]):
         body = body.strip()
         # An empty body (text that starts with a terminator) keeps just the
         # terminator; slicing with [:1] makes that case safe.
         sentences.append(body[:1].upper() + body[1:] + terminator)
     tail = pieces[-1].strip()  # text after the final terminator, if any
     if tail:
         sentences.append(tail[:1].upper() + tail[1:])
     # Join with single spaces, then collapse any whitespace left inside
     # the sentences themselves.
     return " ".join(" ".join(sentences).split())
 def caption_image(img: Image.Image, captioner) -> str:
     """Run *captioner* on *img* and return the first generated caption.

     The image is converted to RGB first whenever its mode is anything
     other than ``"RGB"``.

     Args:
         img: The PIL image to describe.
         captioner: Callable that takes the image and returns a sequence
             of dicts.

     Returns:
         The ``"generated_text"`` value of the first result, or ``""`` when
         the captioner returns nothing or that key is absent.
     """
     rgb_img = img if img.mode == "RGB" else img.convert("RGB")
     predictions = captioner(rgb_img)
     first = predictions[0] if predictions else {}
     return first.get("generated_text", "")
 def story_from_caption(caption: str, pipe) -> str:
     """Generate a short (~100-word) story about *caption* using *pipe*.

     The generated text is cleaned up before it is returned: an echoed
     prompt is removed, the text is cut back to its last complete sentence,
     and the result is passed through ``sentence_case``. When the text
     contains no sentence terminator at all, long output (> 80 characters)
     is truncated and short output (< 20 characters) is kept, and in both
     cases ``"..."`` is appended to signal that it is incomplete.

     Args:
         caption: Scene description to build the story from.
         pipe: Callable taking the prompt plus generation keyword arguments
             and returning a sequence of dicts with ``"generated_text"``.

     Returns:
         The story text, or ``""`` when there is nothing to tell (blank
         caption, no pipeline result, or empty generated text). Returning
         an empty string instead of an error sentence lets callers detect
         failure with a plain truthiness check.
     """
     if not caption or not caption.strip():
         return ""
     prompt = (
         f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
         "\n\nWrite a creative and descriptive short story."
     )
     results = pipe(
         prompt,
         max_length=120,
         min_length=60,
         do_sample=True,            # sampling instead of greedy decoding
         top_k=100,
         top_p=0.9,
         temperature=0.8,
         repetition_penalty=1.1,
         no_repeat_ngram_size=4,
         early_stopping=False,
     )
     if not results:
         return ""
     raw = (results[0].get("generated_text") or "").strip()
     # Some models echo the prompt. The cheap prefix test decides whether an
     # echo is present; the case-insensitive substitution then removes it.
     prompt_check_length = min(len(prompt) // 2, 50)
     if raw.lower().startswith(prompt.lower()[:prompt_check_length]):
         raw = re.sub(re.escape(prompt), '', raw, count=1, flags=re.IGNORECASE).strip()
     if not raw:
         return ""
     # Keep everything up to and including the last sentence terminator.
     last_end = max(raw.rfind(mark) for mark in ".!?")
     if last_end != -1:
         return sentence_case(raw[:last_end + 1])
     # No terminator anywhere in the text.
     if len(raw) > 80:
         # Prefer dropping only the (possibly cut-off) last word; fall back
         # to a hard cut when the last space is too early in the text.
         last_space = raw.rfind(' ')
         body = raw[:last_space] if last_space > 60 else raw[:80]
     elif len(raw) < 20:
         body = raw
     else:
         return sentence_case(raw)
     # Append the ellipsis after casing so it cannot be reformatted.
     return sentence_case(body) + "..."
 def tts_bytes(text: str, tts_pipe) -> bytes:
+    """
+    Given a text string and a tts pipeline, returns WAV-format bytes.
+    Cleans text for better TTS performance and handles audio data conversion.
+    """
+    if not text:
+         return b"" # Return empty bytes if no text
+    # Clean up text for TTS - remove leading/trailing quotes, extra whitespace
     cleaned_text = re.sub(r'^["\']|["\']$', '', text).strip()
+    # Replace multiple periods, handle ellipsis character
+    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
+    cleaned_text = cleaned_text.replace('…', '...')
+    # Ensure text ends with punctuation for better natural speech flow
     if cleaned_text and cleaned_text[-1] not in '.!?':
          cleaned_text += '.'
+    # Remove excessive internal whitespace
+    cleaned_text = " ".join(cleaned_text.split())
+    if not cleaned_text:
+        return b"" # Return empty bytes if cleaning results in empty string
     output = tts_pipe(cleaned_text)
     # pipeline may return list or single dict
     result = output[0] if isinstance(output, list) else output
+    audio_array = result.get("audio") # numpy array: (channels, samples) or (samples,)
+    rate = result.get("sampling_rate") # sampling rate integer
+    if audio_array is None or rate is None:
+        st.error("TTS pipeline did not return expected audio data.")
+        return b""
     # ensure audio_array is 2D (samples, channels) for consistent handling
     if audio_array.ndim == 1:
     buffer = io.BytesIO()
     wf = wave.open(buffer, "wb")
+    try:
+        wf.setnchannels(data.shape[1] if data.ndim == 2 else 1) # set number of channels
+        wf.setsampwidth(2)                     # 16 bits = 2 bytes
+        wf.setframerate(rate)                  # samples per second
+        wf.writeframes(pcm.tobytes())          # write PCM data
+    finally:
+        wf.close() # Ensure the wave file object is closed
     buffer.seek(0)
     return buffer.read()                   # return raw WAV bytes
 # 3) STREAMLIT USER INTERFACE
 # Top-level script: upload -> caption -> story -> narration. Each stage runs
 # inside an st.status box and halts the script with st.stop() on failure.
 st.set_page_config(page_title="Imagine & Narrate", page_icon="✨", layout="centered")
 st.title("✨ Imagine & Narrate")
 st.write("Upload any image below to see AI imagine and narrate a story about it!")
 uploaded = st.file_uploader(
     "Choose an image file",
     type=["jpg", "jpeg", "png"],
 )
 # Nothing to process until a file has been provided.
 if not uploaded:
     st.info("➡️ Upload an image above to start the magic!")
     st.stop()
 # --- Step 1: Load and display the image ---
 with st.status("Loading image...", expanded=True) as status:
     try:
         status.update(label="Opening image file...", state="running")
         img = Image.open(uploaded)
         # Image.open only identifies the file; load() forces the pixel data
         # to be decoded so a corrupt or truncated upload fails here, inside
         # the handler, rather than later when the image is displayed.
         img.load()
         status.update(label="Image loaded successfully!", state="complete", expanded=False)
     except Exception as e:
         status.update(label=f"Error loading image: {e}", state="error")
         st.error(f"Could not load the image. Please try a different file. Error: {e}")
         st.stop()
 st.subheader("📸 Your Visual Input")
 st.image(img, use_container_width=True, caption=uploaded.name)
 st.divider()
 # --- Step 2: Generate caption ---
 st.subheader("🧠 Generating Insights")
 with st.status("Scanning image for key elements…", expanded=True) as status:
     try:
         status.update(label="Running image captioning model...", state="running")
         captioner = load_captioner()
         raw_caption = caption_image(img, captioner)
         if not raw_caption:
             status.update(label="Image analysis failed.", state="error")
             st.warning("Could not generate a caption for the image.")
             st.stop()
         caption = sentence_case(raw_caption)
         status.update(label="Image analyzed, caption generated!", state="complete", expanded=False)
     except Exception as e:
         status.update(label=f"Error during image analysis: {e}", state="error")
         st.error(f"An error occurred during image analysis: {e}")
         st.stop()
 st.markdown(f"**Identified Scene:** {caption}")
 st.divider()
 # --- Step 3: Generate story ---
 st.subheader("📖 Crafting a Narrative")
 with st.status("Writing a compelling story…", expanded=True) as status:
     try:
         status.update(label="Running story generation model...", state="running")
         story_pipe = load_story_pipe()
         story = story_from_caption(caption, story_pipe)
         # Treat an empty result or a bare ellipsis as "no story".
         if not story or story.strip() in ['.', '..', '...']:
             status.update(label="Story generation failed.", state="error")
             st.warning("Could not generate a meaningful story from the caption.")
             st.stop()
         status.update(label="Story crafted!", state="complete", expanded=False)
     except Exception as e:
         status.update(label=f"Error during story generation: {e}", state="error")
         st.error(f"An error occurred during story generation: {e}")
         st.stop()
 st.write(story)
 st.divider()
 # --- Step 4: Synthesize audio ---
 st.subheader("👂 Hear the Story")
 # Audio failure is reported but does not stop the script: the caption and
 # story above are still useful on their own.
 audio_ready = False
 with st.status("Synthesizing audio narration…", expanded=True) as status:
     try:
         status.update(label="Running text-to-speech model...", state="running")
         tts_pipe = load_tts_pipe()
         audio_bytes = tts_bytes(story, tts_pipe)
         if not audio_bytes:
             status.update(label="Audio generation failed.", state="error")
             st.warning("Could not generate audio for the story.")
         else:
             status.update(label="Audio generated!", state="complete", expanded=False)
             st.audio(audio_bytes, format="audio/wav")
             audio_ready = True
     except Exception as e:
         status.update(label=f"Error during audio synthesis: {e}", state="error")
         st.error(f"An error occurred during audio synthesis: {e}")
 # --- Celebration ---
 # Only celebrate when every stage, including narration, succeeded.
 if audio_ready:
     st.balloons()