# app.py
import io # for creating in-memory binary streams
import wave # for writing WAV audio files
import re # for regular expression utilities
import streamlit as st # Streamlit UI library
from transformers import pipeline # Hugging Face inference pipelines
from PIL import Image # Python Imaging Library for image loading
import numpy as np # numerical operations, especially array handling
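# How to run (a minimal sketch; the exact package set is an assumption beyond the
# imports above, e.g. the transformers pipelines also need a backend such as torch):
#   pip install streamlit transformers torch pillow numpy
#   streamlit run app.py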
# 1) CACHE & LOAD MODELS
@st.cache_resource(show_spinner=False)
def load_captioner():
    # Loads the BLIP image-to-text model; cached so it loads only once.
    # Returns: a callable captioner(image: PIL.Image) -> List[Dict].
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device="cpu"  # change to "cuda" if a GPU is available
    )
@st.cache_resource(show_spinner=False)
def load_story_pipe():
    # Loads the FLAN-T5 text-to-text model for story generation; cached once.
    # Returns: a callable story_pipe(prompt: str, **kwargs) -> List[Dict].
    return pipeline(
        "text2text-generation",
        model="google/flan-t5-base",
        device="cpu"  # change to "cuda" if a GPU is available
    )
@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    # Loads the Meta MMS-TTS text-to-speech model; cached once.
    # Returns: a callable tts_pipe(text: str) -> Dict (or List[Dict]) with "audio" and "sampling_rate".
    return pipeline(
        "text-to-speech",
        model="facebook/mms-tts-eng",
        device="cpu"  # change to "cuda" if a GPU is available
    )
# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    # Splits text into sentences on .!? delimiters,
    # capitalizes the first character of each sentence,
    # then rejoins into a single string.
    parts = re.split(r'([.!?])', text)  # e.g. ["hello", ".", " world", "!", ""]
    out = []
    for i in range(0, len(parts) - 1, 2):
        sentence = parts[i].strip().capitalize()  # capitalize first letter
        delimiter = parts[i + 1]                  # sentence-ending punctuation
        out.append(f"{sentence}{delimiter}")
    # If trailing text without punctuation exists, capitalize and append it.
    if len(parts) % 2:
        last = parts[-1].strip().capitalize()
        if last:
            out.append(last)
    # Join the sentences and collapse any repeated whitespace into single spaces.
    return " ".join(" ".join(out).split())
def caption_image(img: Image.Image, captioner) -> str:
    # Given a PIL image and a captioner pipeline, returns a single-line caption.
    results = captioner(img)  # run the captioning model
    if not results:
        return ""
    # Extract the "generated_text" field from the first result.
    return results[0].get("generated_text", "")
def story_from_caption(caption: str, pipe) -> str:
    # Given a caption string and a text2text pipeline, returns a ~100-word story.
    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
    results = pipe(
        prompt,
        max_length=120,          # maximum length of the generated output
        min_length=80,           # minimum length of the generated output
        do_sample=True,          # enable sampling instead of greedy decoding
        top_k=100,               # sample only from the top-k most likely tokens
        top_p=0.9,               # nucleus sampling threshold
        temperature=0.7,         # sampling temperature
        repetition_penalty=1.1,  # discourage repetition
        no_repeat_ngram_size=4,  # block repeated n-grams
        early_stopping=False
    )
    raw = results[0]["generated_text"].strip()  # full generated text
    # Strip out the prompt if the model echoes it back (case-insensitive comparison).
    if raw.lower().startswith(prompt.lower()):
        raw = raw[len(prompt):].strip()
    # Trim to the last complete sentence ending in . ! or ?
    match = re.search(r'[.!?]', raw[::-1])  # find the last punctuation mark by scanning the reversed string
    if match:
        raw = raw[:len(raw) - match.start()]  # keep everything up to and including that punctuation
    elif len(raw) > 80:
        # No sentence-ending punctuation at all: cut long output to a reasonable length.
        raw = raw[:80] + "..."
    return sentence_case(raw)
def tts_bytes(text: str, tts_pipe) -> bytes:
    # Given a text string and a TTS pipeline, returns WAV-format bytes.
    # Clean up text for TTS: remove leading/trailing quotes, etc.
    cleaned_text = re.sub(r'^["\']|["\']$', '', text).strip()
    # Basic punctuation cleaning (optional, depending on the TTS model).
    cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)  # collapse runs of periods into one
    cleaned_text = cleaned_text.replace('…', '...')      # replace the ellipsis character with dots
    # Add a period if the text doesn't end with punctuation (helps the TTS model finalize).
    if cleaned_text and cleaned_text[-1] not in '.!?':
        cleaned_text += '.'
    output = tts_pipe(cleaned_text)
    # The pipeline may return a list or a single dict.
    result = output[0] if isinstance(output, list) else output
    audio_array = result["audio"]    # numpy array: (channels, samples) or (samples,)
    rate = result["sampling_rate"]   # sampling rate in Hz
    # Ensure audio_array is 2D (samples, channels) for consistent handling.
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]  # add a channel dimension
    else:
        data = audio_array.T               # transpose (channels, samples) -> (samples, channels)
    # Convert float32 in [-1, 1] to int16 PCM in [-32768, 32767]; clip to avoid integer overflow.
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    buffer = io.BytesIO()
    wf = wave.open(buffer, "wb")
    wf.setnchannels(data.shape[1])  # number of channels
    wf.setsampwidth(2)              # 16 bits = 2 bytes per sample
    wf.setframerate(rate)           # samples per second
    wf.writeframes(pcm.tobytes())   # write PCM data
    wf.close()
    buffer.seek(0)
    return buffer.read()  # return raw WAV bytes
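# Usage sketch (hypothetical, outside the Streamlit flow below):
#   wav = tts_bytes("A quiet morning by the lake.", load_tts_pipe())
#   open("story.wav", "wb").write(wav)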
# 3) STREAMLIT USER INTERFACE
st.set_page_config(page_title="Imagine & Narrate", page_icon="✨", layout="centered")
st.title("✨ Imagine & Narrate")
st.write("Upload any image below to see AI imagine and narrate a story about it!")

# -- Upload image widget --
uploaded = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if not uploaded:
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()
# Load the uploaded file into a PIL Image.
try:
    img = Image.open(uploaded)
except Exception as e:
    st.error(f"Error loading image: {e}")
    st.stop()
# -- Step 1: Display the image --
st.subheader("📸 Your Visual Input")
st.image(img, use_container_width=True)
st.divider()

# -- Step 2: Generate and display caption --
st.subheader("🧠 Generating Insights")
with st.spinner("Scanning image for key elements…"):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.warning("Could not generate a caption for the image.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()
# -- Step 3: Generate and display story --
st.subheader("📖 Crafting a Narrative")
with st.spinner("Writing a compelling story…"):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
if not story or story.strip() == '...':  # check for an empty or minimal story
    st.warning("Could not generate a meaningful story from the caption.")
    st.stop()
st.write(story)
st.divider()
# -- Step 4: Synthesize and play audio --
st.subheader("👂 Hear the Story")
with st.spinner("Synthesizing audio narration…"):
    tts_pipe = load_tts_pipe()
    try:
        audio_bytes = tts_bytes(story, tts_pipe)
        st.audio(audio_bytes, format="audio/wav")
    except Exception as e:
        st.error(f"Error generating audio: {e}")

# Celebration animation
st.balloons()