Spaces:

justREE
/

Assignment1

Sleeping

App Files Files Community

justREE committed on May 2

Commit

f136dda

verified ·

1 Parent(s): 70a4e6d

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +95 -229

src/streamlit_app.py CHANGED Viewed

@@ -1,5 +1,3 @@
-# app.py
 import io                   # for creating in-memory binary streams
 import wave                 # for writing WAV audio files
 import re                   # for regular expression utilities
@@ -8,303 +6,171 @@ from transformers import pipeline  # Hugging Face inference pipelines
 from PIL import Image       # Python Imaging Library for image loading
 import numpy as np          # numerical operations, especially array handling
-# 1) CACHE & LOAD MODELS
-# Use cache_resource for models/objects that should be loaded once per session/run
 @st.cache_resource(show_spinner=False)
 def load_captioner():
-    """Loads BLIP image-to-text model; cached so it loads only once."""
-    # Returns: a function captioner(image: PIL.Image) -> List[Dict]
-    # Using device="cpu" for broader compatibility. Change to "cuda" for GPU.
     return pipeline(
         "image-to-text",
         model="Salesforce/blip-image-captioning-base",
-        device="cpu"
     )
 @st.cache_resource(show_spinner=False)
 def load_story_pipe():
-    """Loads FLAN-T5 text-to-text model for story generation; cached once."""
-    # Returns: a function story_pipe(prompt: str, **kwargs) -> List[Dict].
-    # Using device="cpu" for broader compatibility. Change to "cuda" for GPU.
     return pipeline(
         "text2text-generation",
         model="google/flan-t5-base",
-        device="cpu"
     )
 @st.cache_resource(show_spinner=False)
 def load_tts_pipe():
-    """Loads Meta MMS-TTS text-to-speech model; cached once."""
-    # Returns: a function tts_pipe(text: str) -> List[Dict] with "audio" and "sampling_rate".
-    # Using device="cpu" for broader compatibility. Change to "cuda" for GPU.
     return pipeline(
         "text-to-speech",
         model="facebook/mms-tts-eng",
-        device="cpu"
     )
 # 2) HELPER FUNCTIONS
 def sentence_case(text: str) -> str:
-    """
-    Splits text into sentences on .!? delimiters,
-    capitalizes the first character of each sentence,
-    then rejoins into a single string. Handles edge cases like leading/trailing spaces.
-    """
-    # Split while keeping the delimiters
     parts = re.split(r'([.!?])', text)
     out = []
-    # Iterate through parts, taking text followed by delimiter
     for i in range(0, len(parts) - 1, 2):
-        sentence = parts[i].strip() # Get the sentence text and remove surrounding whitespace
-        delimiter = parts[i + 1]    # Get the delimiter
-        if sentence: # Only process if there's actual text
-            # Capitalize the first letter of the cleaned sentence part
-            formatted_sentence = sentence[0].upper() + sentence[1:]
-            # Append the formatted sentence and its delimiter
-            out.append(f"{formatted_sentence}{delimiter}")
-        elif delimiter.strip(): # Handle cases where there's just a delimiter (e.g., "...")
-             out.append(delimiter)
-    # Handle any remaining part if the text didn't end with a delimiter
     if len(parts) % 2:
-        last_part = parts[-1].strip()
-        if last_part:
-             # Capitalize the first letter of the last part
-            formatted_last_part = last_part[0].upper() + last_part[1:]
-            out.append(formatted_last_part)
-    # Join parts and clean up potential excess spaces
-    # Join with a space first, then split and rejoin to handle multiple spaces
     return " ".join(" ".join(out).split())
 def caption_image(img: Image.Image, captioner) -> str:
-    """
-    Given a PIL image and a captioner pipeline, returns a single-line caption.
-    """
-    # Ensure image is in RGB format, as some models might expect it
     if img.mode != "RGB":
         img = img.convert("RGB")
-    results = captioner(img)  # run model
-    if not results:
-        return ""
-    # extract "generated_text" field from first result
-    return results[0].get("generated_text", "")
 def story_from_caption(caption: str, pipe) -> str:
-    """
-    Given a caption string and a text2text pipeline, returns a ~100-word story.
-    """
     if not caption:
         return "Could not generate a story without a caption."
-    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
-    # Add a directive for slightly more coherence
-    prompt += "\n\nWrite a creative and descriptive short story."
     results = pipe(
         prompt,
-        max_length=120,            # increased max length slightly
-        min_length=60,             # reduced min length slightly for robustness
-        do_sample=True,            # enable sampling for creativity
-        top_k=100,                 # sample from top_k tokens
-        top_p=0.9,                 # nucleus sampling threshold
-        temperature=0.8,           # slightly increased temperature for more randomness
-        repetition_penalty=1.1,    # discourage repetition
-        no_repeat_ngram_size=4,    # block repeated n-grams
         early_stopping=False
     )
-    raw = results[0]["generated_text"].strip()  # full generated text
-    # strip out the prompt if it echoes back - make comparison case-insensitive
-    # Check if the generated text starts with a substantial part of the prompt
-    prompt_check_length = min(len(prompt) // 2, 50) # Check against first half or 50 chars
-    if raw.lower().startswith(prompt.lower()[:prompt_check_length]):
-        # Attempt to remove the echoed prompt more robustly
-        raw = re.sub(re.escape(prompt), '', raw, count=1, flags=re.IGNORECASE).strip()
-    # trim to last complete sentence ending in . ! or ?
-    # Search for the first punctuation from the end of the string
-    match = re.search(r'[.!?]', raw[::-1])
-    if match:
-        # Trim the string at the position of the found punctuation
-        raw = raw[:len(raw) - match.start()]
-    elif len(raw) > 80: # If no punctuation found and story is long, trim and add ellipsis
-         raw = raw[:raw.rfind(' ') if raw.rfind(' ') != -1 and raw.rfind(' ') > 60 else 80] + "..."
-    elif len(raw) < 20: # If the story is very short and has no punctuation
-         raw += "..." # Add ellipsis to indicate it might be incomplete
     return sentence_case(raw)
 def tts_bytes(text: str, tts_pipe) -> bytes:
-    """
-    Given a text string and a tts pipeline, returns WAV-format bytes.
-    Cleans text for better TTS performance and handles audio data conversion.
-    """
     if not text:
-         return b"" # Return empty bytes if no text
-    # Clean up text for TTS - remove leading/trailing quotes, extra whitespace
-    cleaned_text = re.sub(r'^["\']|["\']$', '', text).strip()
-    # Replace multiple periods, handle ellipsis character
-    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
-    cleaned_text = cleaned_text.replace('…', '...')
-    # Ensure text ends with punctuation for better natural speech flow
-    if cleaned_text and cleaned_text[-1] not in '.!?':
-         cleaned_text += '.'
-    # Remove excessive internal whitespace
-    cleaned_text = " ".join(cleaned_text.split())
-    if not cleaned_text:
-        return b"" # Return empty bytes if cleaning results in empty string
-    output = tts_pipe(cleaned_text)
-    # pipeline may return list or single dict
     result = output[0] if isinstance(output, list) else output
-    audio_array = result.get("audio") # numpy array: (channels, samples) or (samples,)
-    rate = result.get("sampling_rate") # sampling rate integer
     if audio_array is None or rate is None:
-        st.error("TTS pipeline did not return expected audio data.")
         return b""
-    # ensure audio_array is 2D (samples, channels) for consistent handling
     if audio_array.ndim == 1:
-        data = audio_array[:, np.newaxis] # add channel dimension
     else:
-        data = audio_array.T # transpose from (channels, samples) to (samples, channels)
-    # convert float32 [-1..1] to int16 PCM [-32768..32767]
     pcm = (data * 32767).astype(np.int16)
-    buffer = io.BytesIO()
-    wf = wave.open(buffer, "wb")
-    try:
-        wf.setnchannels(data.shape[1] if data.ndim == 2 else 1) # set number of channels
-        wf.setsampwidth(2)                     # 16 bits = 2 bytes
-        wf.setframerate(rate)                  # samples per second
-        wf.writeframes(pcm.tobytes())          # write PCM data
-    finally:
-        wf.close() # Ensure the wave file object is closed
-    buffer.seek(0)
-    return buffer.read()                   # return raw WAV bytes
 # 3) STREAMLIT USER INTERFACE
-# --- Page Config ---
-st.set_page_config(page_title="Imagine & Narrate", page_icon="✨", layout="centered")
-# --- Title and Intro ---
-st.title("✨ Imagine & Narrate")
-st.write("Upload any image below to see AI imagine and narrate a story about it!")
-# --- File Uploader ---
-uploaded = st.file_uploader(
     "Choose an image file",
-    type=["jpg", "jpeg", "png"] # Specify allowed types
-    # Add an optional help text
-    # help="Supported formats: JPG, JPEG, PNG."
 )
-# --- Handle No Upload ---
-if not uploaded:
     st.info("➡️ Upload an image above to start the magic!")
-    st.stop() # Halt execution until file is uploaded
-# --- Image Loading ---
-# Use st.status for a nicer progress/status display during potentially slow steps
-with st.status("Loading image...", expanded=True) as status:
-    try:
-        status.update(label="Opening image file...", state="running")
-        img = Image.open(uploaded)
-        status.update(label="Image loaded successfully!", state="complete", expanded=False)
-    except Exception as e:
-        status.update(label=f"Error loading image: {e}", state="error")
-        st.error(f"Could not load the image. Please try a different file. Error: {e}")
-        st.stop() # Stop if image loading fails
-# --- Display Image ---
 st.subheader("📸 Your Visual Input")
-st.image(img, use_container_width=True, caption=uploaded.name) # Add caption with filename
 st.divider()
-# --- Step 2: Generate Caption ---
-st.subheader("🧠 Generating Insights")
-# Using st.status again for the pipeline steps
-with st.status("Scanning image for key elements…", expanded=True) as status:
-    try:
-        status.update(label="Running image captioning model...", state="running")
-        captioner = load_captioner()
-        raw_caption = caption_image(img, captioner)
-        if not raw_caption:
-             status.update(label="Image analysis failed.", state="error")
-             st.warning("Could not generate a caption for the image.")
-             st.stop()
-        caption = sentence_case(raw_caption)
-        status.update(label="Image analyzed, caption generated!", state="complete", expanded=False)
-    except Exception as e:
-        status.update(label=f"Error during image analysis: {e}", state="error")
-        st.error(f"An error occurred during image analysis: {e}")
         st.stop()
 st.markdown(f"**Identified Scene:** {caption}")
 st.divider()
-# --- Step 3: Generate Story ---
-st.subheader("📖 Crafting a Narrative")
-with st.status("Writing a compelling story…", expanded=True) as status:
-    try:
-        status.update(label="Running story generation model...", state="running")
-        story_pipe = load_story_pipe()
-        story = story_from_caption(caption, story_pipe)
-        if not story or story.strip() in ['.', '..', '...']: # Check for empty or minimal story
-             status.update(label="Story generation failed.", state="error")
-             st.warning("Could not generate a meaningful story from the caption.")
-             st.stop()
-        status.update(label="Story crafted!", state="complete", expanded=False)
-    except Exception as e:
-        status.update(label=f"Error during story generation: {e}", state="error")
-        st.error(f"An error occurred during story generation: {e}")
         st.stop()
 st.write(story)
 st.divider()
-# --- Step 4: Synthesize Audio ---
 st.subheader("👂 Hear the Story")
-with st.status("Synthesizing audio narration…", expanded=True) as status:
-    try:
-        status.update(label="Running text-to-speech model...", state="running")
-        tts_pipe = load_tts_pipe()
-        audio_bytes = tts_bytes(story, tts_pipe)
-        if not audio_bytes:
-            status.update(label="Audio generation failed.", state="error")
-            st.warning("Could not generate audio for the story.")
-        else:
-            status.update(label="Audio generated!", state="complete", expanded=False)
-            st.audio(audio_bytes, format="audio/wav")
-    except Exception as e:
-        status.update(label=f"Error during audio synthesis: {e}", state="error")
-        st.error(f"An error occurred during audio synthesis: {e}")
-# --- Celebration ---
 st.balloons()

 import io                   # for creating in-memory binary streams
 import wave                 # for writing WAV audio files
 import re                   # for regular expression utilities
 from PIL import Image       # Python Imaging Library for image loading
 import numpy as np          # numerical operations, especially array handling
+# 1) CACHE & LOAD MODELS (CPU only)
 @st.cache_resource(show_spinner=False)
 def load_captioner():
     """Create the BLIP image-captioning pipeline, pinned to CPU.

     Decorated with ``st.cache_resource`` so the pipeline object is built
     once and reused rather than being reconstructed on every call.
     """
     captioner = pipeline(
         "image-to-text",
         model="Salesforce/blip-image-captioning-base",
         device=-1,  # force CPU
     )
     return captioner
 @st.cache_resource(show_spinner=False)
 def load_story_pipe():
     """Create the FLAN-T5 text-to-text pipeline used for stories, pinned to CPU.

     Decorated with ``st.cache_resource`` so the pipeline object is built
     once and reused rather than being reconstructed on every call.
     """
     story_pipe = pipeline(
         "text2text-generation",
         model="google/flan-t5-base",
         device=-1,  # force CPU
     )
     return story_pipe
 @st.cache_resource(show_spinner=False)
 def load_tts_pipe():
     """Create the MMS-TTS English text-to-speech pipeline, pinned to CPU.

     Decorated with ``st.cache_resource`` so the pipeline object is built
     once and reused rather than being reconstructed on every call.
     """
     tts = pipeline(
         "text-to-speech",
         model="facebook/mms-tts-eng",
         device=-1,  # force CPU
     )
     return tts
 # 2) HELPER FUNCTIONS
 def sentence_case(text: str) -> str:
     """Capitalize the first character of every sentence in ``text``.

     Sentences are delimited by runs of ``.``, ``!`` or ``?``. A run is kept
     intact, so ``"..."`` and ``"?!"`` survive instead of being reduced to a
     single character. Whitespace is normalized to single spaces and
     sentences are joined with one space.

     Args:
         text: Arbitrary text, possibly empty.

     Returns:
         The re-cased, whitespace-normalized text. Punctuation that is not
         preceded by any sentence text (e.g. a leading ``"..."``) is dropped.
     """
     # With a single capture group, re.split always returns an odd-length
     # list shaped: text, delimiter, text, delimiter, ..., text.
     parts = re.split(r'([.!?]+)', text)
     out = []
     for body, delimiter in zip(parts[0::2], parts[1::2]):
         body = body.strip()
         if body:
             out.append(f"{body[0].upper()}{body[1:]}{delimiter}")
     # The final element is whatever follows the last delimiter (may be "").
     tail = parts[-1].strip()
     if tail:
         out.append(tail[0].upper() + tail[1:])
     return " ".join(" ".join(out).split())
 def caption_image(img: Image.Image, captioner) -> str:
     """Return the caption text that ``captioner`` produces for ``img``.

     Images whose mode is not ``"RGB"`` are converted to RGB first. An empty
     string is returned when the captioner yields no results or when the
     first result has no ``"generated_text"`` entry.
     """
     rgb_img = img if img.mode == "RGB" else img.convert("RGB")
     predictions = captioner(rgb_img)
     if not predictions:
         return ""
     first = predictions[0]
     return first.get("generated_text", "")
 def story_from_caption(caption: str, pipe) -> str:
     """Generate a short story about ``caption`` using the ``pipe`` callable.

     The generated text has any echo of the prompt removed, is cut back to
     its last ``.``, ``!`` or ``?``, and is passed through ``sentence_case``.
     When there is no sentence punctuation and the text is longer than 80
     characters, it is truncated and ``"..."`` is appended instead.

     Returns a fixed fallback message when ``caption`` is empty.
     """
     if not caption:
         return "Could not generate a story without a caption."
     prompt = (
         f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
         "\n\nWrite a creative and descriptive short story."
     )
     generation_options = {
         "max_length": 120,
         "min_length": 60,
         "do_sample": True,
         "top_k": 100,
         "top_p": 0.9,
         "temperature": 0.8,
         "repetition_penalty": 1.1,
         "no_repeat_ngram_size": 4,
         "early_stopping": False,
     }
     outputs = pipe(prompt, **generation_options)
     text = outputs[0]["generated_text"].strip()
     # Strip the prompt (case-insensitively) if the model echoed it back.
     text = re.sub(re.escape(prompt), "", text, flags=re.IGNORECASE).strip()
     # Index of the right-most sentence terminator, or -1 if there is none.
     last_stop = max(text.rfind(mark) for mark in ".!?")
     if last_stop != -1:
         text = text[:last_stop + 1]
     elif len(text) > 80:
         # Prefer a word boundary past character 60; otherwise cut hard at 80.
         last_space = text.rfind(" ")
         cut = last_space if last_space > 60 else 80
         text = text[:cut] + "..."
     return sentence_case(text)
 def tts_bytes(text: str, tts_pipe) -> bytes:
     """Synthesize ``text`` with ``tts_pipe`` and return 16-bit PCM WAV bytes.

     The text is cleaned before synthesis: one leading/trailing quote is
     removed, the Unicode ellipsis and runs of periods are reduced to a
     single period, whitespace is normalized, and a final period is added
     when the text does not already end in ``.``, ``!`` or ``?``.

     Args:
         text: The text to speak.
         tts_pipe: Callable taking the cleaned text and returning a dict (or
             a list whose first item is a dict) with ``"audio"`` and
             ``"sampling_rate"`` entries.

     Returns:
         WAV file contents, or ``b""`` when there is nothing to speak or the
         pipeline returned no usable audio.
     """
     if not text:
         return b""
     cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
     # Expand the ellipsis character first so the collapse below covers it
     # too; doing it afterwards would put a "..." back into the text.
     cleaned = cleaned.replace('…', '...')
     cleaned = re.sub(r'\.{2,}', '.', cleaned)
     cleaned = " ".join(cleaned.split())
     if not cleaned:
         # Input was only quotes/whitespace; indexing cleaned[-1] would fail.
         return b""
     if cleaned[-1] not in ".!?":
         cleaned += "."
     output = tts_pipe(cleaned)
     if isinstance(output, list):
         if not output:
             return b""
         result = output[0]
     else:
         result = output
     audio_array = result.get("audio")
     rate = result.get("sampling_rate")
     if audio_array is None or rate is None:
         return b""
     samples = np.asarray(audio_array)
     if samples.ndim == 1:
         data = samples[:, np.newaxis]
     else:
         # NOTE(review): assumes multi-dimensional audio is laid out as
         # (channels, samples), as the pre-commit comments stated -- confirm.
         data = samples.T
     # Clip before scaling so out-of-range samples saturate instead of
     # wrapping around in int16; "<i2" is the little-endian layout WAV needs.
     pcm = (np.clip(data, -1.0, 1.0) * 32767).astype("<i2")
     buf = io.BytesIO()
     # The context manager finalizes the WAV header even if a write raises.
     with wave.open(buf, "wb") as wf:
         wf.setnchannels(data.shape[1])
         wf.setsampwidth(2)
         wf.setframerate(rate)
         wf.writeframes(pcm.tobytes())
     return buf.getvalue()
 # 3) STREAMLIT USER INTERFACE
 # Script flow: upload -> caption -> story -> audio. Each stage stops the
 # script with an error message if it produces nothing usable.
 st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")
 # Persist upload across reruns
 if "uploaded_file" not in st.session_state:
     st.session_state.uploaded_file = None
 new_upload = st.file_uploader(
     "Choose an image file",
     type=["jpg", "jpeg", "png"]
 )
 if new_upload is not None:
     st.session_state.uploaded_file = new_upload
 if st.session_state.uploaded_file is None:
     st.title("✨ Imagine & Narrate")
     st.info("➡️ Upload an image above to start the magic!")
     st.stop()
 uploaded = st.session_state.uploaded_file
 try:
     # The file object kept in session_state may have been read by an
     # earlier run, so rewind it before handing it to PIL again.
     uploaded.seek(0)
     img = Image.open(uploaded)
     # Image.open only identifies the file; decode it now so a corrupt or
     # truncated upload is reported here rather than failing in st.image.
     img.load()
 except Exception as e:
     st.error(f"Could not load the image: {e}")
     st.stop()
 st.title("✨ Imagine & Narrate")
 st.subheader("📸 Your Visual Input")
 st.image(img, caption=uploaded.name, use_container_width=True)
 st.divider()
 # Step 1: Generate Caption
 st.subheader("🧠 Generating Caption")
 with st.spinner("Analyzing image..."):
     captioner = load_captioner()
     raw_caption = caption_image(img, captioner)
     if not raw_caption:
         st.error("Failed to generate caption.")
         st.stop()
     caption = sentence_case(raw_caption)
 st.markdown(f"**Identified Scene:** {caption}")
 st.divider()
 # Step 2: Generate Story
 st.subheader("📖 Crafting a Story")
 with st.spinner("Writing story..."):
     story_pipe = load_story_pipe()
     story = story_from_caption(caption, story_pipe)
     # A story consisting only of periods counts as a failed generation.
     if not story or story.strip() in {".", "..", "..."}:
         st.error("Failed to generate story.")
         st.stop()
 st.write(story)
 st.divider()
 # Step 3: Synthesize Audio
 st.subheader("👂 Hear the Story")
 with st.spinner("Synthesizing audio..."):
     tts_pipe = load_tts_pipe()
     audio_bytes = tts_bytes(story, tts_pipe)
     # Missing audio is only a warning: the written story is already shown.
     if not audio_bytes:
         st.warning("Audio generation failed.")
     else:
         st.audio(audio_bytes, format="audio/wav")
 st.balloons()