Spaces:
Sleeping
Sleeping
File size: 5,235 Bytes
d732c64 92fdc22 a791bff b88893e 92fdc22 b88893e a791bff b88893e 92fdc22 b88893e a791bff b88893e 92fdc22 272411e a791bff b88893e a791bff 92fdc22 b88893e 92fdc22 a791bff 92fdc22 a791bff 92fdc22 b88893e 92fdc22 b88893e 92fdc22 b88893e 92fdc22 1292c85 b88893e 1292c85 654e0e4 1292c85 b88893e 92fdc22 b88893e 654e0e4 f137cdb b88893e 92fdc22 b88893e 92fdc22 b88893e 92fdc22 b88893e 92fdc22 f74c9b9 92fdc22 b88893e 92fdc22 b88893e 92fdc22 b88893e 92fdc22 b88893e 92fdc22 b88893e 92fdc22 29df25d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
import io
import wave
import re
import streamlit as st
from transformers import pipeline
from PIL import Image
import numpy as np
# 1) CACHE & LOAD MODELS (CPU only)
@st.cache_resource(show_spinner=False)
def load_captioner():
    """Build the BLIP image-captioning pipeline once and cache it across reruns."""
    config = {
        "model": "Salesforce/blip-image-captioning-base",
        "device": -1,  # force CPU
    }
    return pipeline("image-to-text", **config)
@st.cache_resource(show_spinner=False)
def load_story_pipe():
    """Build the FLAN-T5 text-to-text pipeline once and cache it across reruns."""
    config = {
        "model": "google/flan-t5-base",
        "device": -1,  # force CPU
    }
    return pipeline("text2text-generation", **config)
@st.cache_resource(show_spinner=False)
def load_tts_pipe():
    """Build the MMS English text-to-speech pipeline once and cache it across reruns."""
    config = {
        "model": "facebook/mms-tts-eng",
        "device": -1,  # force CPU
    }
    return pipeline("text-to-speech", **config)
# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    """Capitalize the first letter of each sentence and normalize whitespace.

    Sentences are delimited by '.', '!', or '?'; the delimiter is kept,
    only the first character of each sentence is uppercased (the rest is
    untouched), and all runs of whitespace collapse to single spaces.
    """
    chunks = re.split(r'([.!?])', text)
    pieces = []
    # chunks alternates sentence-text / delimiter; walk them as pairs.
    for body, mark in zip(chunks[0::2], chunks[1::2]):
        body = body.strip()
        if body:
            pieces.append(f"{body[0].upper()}{body[1:]}{mark}")
    # An odd-length split leaves a trailing fragment with no terminator.
    if len(chunks) % 2:
        tail = chunks[-1].strip()
        if tail:
            pieces.append(tail[0].upper() + tail[1:])
    return " ".join(" ".join(pieces).split())
def caption_image(img: Image.Image, captioner) -> str:
    """Run the captioning pipeline on *img* and return the generated text.

    Returns "" when the pipeline produces no results or no text.
    """
    # BLIP expects RGB input; convert palette/alpha/grayscale modes first.
    rgb = img if img.mode == "RGB" else img.convert("RGB")
    outputs = captioner(rgb)
    if not outputs:
        return ""
    return outputs[0].get("generated_text", "")
def story_from_caption(caption: str, pipe) -> str:
    """Expand an image caption into a short story via the text2text pipeline.

    Returns a fixed fallback message when *caption* is empty.
    """
    if not caption:
        return "Could not generate a story without a caption."
    prompt = f"Write a creative imaginary 50–100 word story using this scene: {caption}"
    generation = pipe(
        prompt,
        max_length=100,
        min_length=80,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.5,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    text = generation[0]["generated_text"].strip()
    # Some models echo the prompt back; scrub it out defensively.
    text = re.sub(re.escape(prompt), "", text, flags=re.IGNORECASE).strip()
    # Drop any dangling partial sentence after the last terminator.
    cut = max(text.rfind(mark) for mark in ".!?")
    if cut != -1:
        text = text[:cut + 1]
    return sentence_case(text)
def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize *text* and return it as 16-bit PCM WAV bytes.

    Returns b"" when *text* is empty, when it reduces to nothing after
    quote-stripping, or when the TTS pipeline yields no audio/sampling rate.
    """
    if not text:
        return b""
    # Strip one leading/trailing quote and collapse runs of dots; the TTS
    # model handles plain punctuated prose best.
    cleaned = re.sub(r'^["\']|["\']$', '', text).strip()
    cleaned = re.sub(r'\.{2,}', '.', cleaned).replace('…', '...')
    if not cleaned:
        # BUGFIX: quote-only input used to raise IndexError on cleaned[-1].
        return b""
    if cleaned[-1] not in ".!?":
        cleaned += "."
    cleaned = " ".join(cleaned.split())
    output = tts_pipe(cleaned)
    # Some pipelines return a list of results; take the first either way.
    result = output[0] if isinstance(output, list) else output
    audio_array = result.get("audio")
    rate = result.get("sampling_rate")
    if audio_array is None or rate is None:
        return b""
    # Normalize to shape (frames, channels) for the wave writer.
    if audio_array.ndim == 1:
        data = audio_array[:, np.newaxis]
    else:
        data = audio_array.T
    # Clip before scaling so out-of-range float samples can't wrap around
    # when cast to int16.
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    buf = io.BytesIO()
    # Context manager guarantees the WAV header is finalized and closed
    # even if a write fails part-way.
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(data.shape[1])
        wf.setsampwidth(2)  # 16-bit samples
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    return buf.getvalue()
# 3) STREAMLIT USER INTERFACE
# Flat script: Streamlit re-runs it top-to-bottom on every interaction, so
# statement order matters and st.stop() ends the current run early.
st.set_page_config(page_title="✨ Imagine & Narrate", page_icon="✨", layout="centered")
# Persist upload across reruns — session_state survives reruns, so the file
# doesn't vanish when an unrelated widget interaction re-executes the script.
if "uploaded_file" not in st.session_state:
    st.session_state.uploaded_file = None
new_upload = st.file_uploader(
    "Choose an image file",
    type=["jpg", "jpeg", "png"]
)
if new_upload is not None:
    st.session_state.uploaded_file = new_upload
# No image yet: show the landing prompt and end this run.
if st.session_state.uploaded_file is None:
    st.title("✨ Imagine & Narrate")
    st.info("➡️ Upload an image above to start the magic!")
    st.stop()
uploaded = st.session_state.uploaded_file
try:
    img = Image.open(uploaded)
except Exception as e:
    # Corrupt/unsupported file: report and halt instead of crashing the app.
    st.error(f"Could not load the image: {e}")
    st.stop()
st.title("✨ Imagine & Narrate")
st.subheader("📸 Your Visual Input")
st.image(img, caption=uploaded.name, use_container_width=True)
st.divider()
# Step 1: Generate Caption — models load lazily inside the spinner; cached
# by st.cache_resource so only the first run pays the load cost.
st.subheader("🧠 Generating Caption")
with st.spinner("Analyzing image..."):
    captioner = load_captioner()
    raw_caption = caption_image(img, captioner)
if not raw_caption:
    st.error("Failed to generate caption.")
    st.stop()
caption = sentence_case(raw_caption)
st.markdown(f"**Identified Scene:** {caption}")
st.divider()
# Step 2: Generate Story — feeds the cleaned caption to the text2text model.
st.subheader("📖 Crafting a Story")
with st.spinner("Writing story..."):
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
# Guard against degenerate outputs (empty or punctuation-only stories).
if not story or story.strip() in {".", "..", "..."}:
    st.error("Failed to generate story.")
    st.stop()
st.write(story)
st.divider()
# Step 3: Synthesize Audio — failure here is non-fatal (warning, not stop),
# since the caption and story are already on screen.
st.subheader("👂 Hear the Story")
with st.spinner("Synthesizing audio..."):
    tts_pipe = load_tts_pipe()
    audio_bytes = tts_bytes(story, tts_pipe)
if not audio_bytes:
    st.warning("Audio generation failed.")
else:
    st.audio(audio_bytes, format="audio/wav")
    st.balloons()