Spaces:

justREE
/

StoryTeller

Sleeping

App Files Community

justREE committed on May 1

Commit

b88893e

verified ·

1 Parent(s): fd3e00c

Update app.py

Browse files

Files changed (1) hide show

app.py +126 -78

app.py CHANGED Viewed

@@ -4,108 +4,156 @@ import io
 import wave
 import re
 import streamlit as st
-from transformers import pipeline, SpeechT5Processor, SpeechT5HifiGan
-from datasets import load_dataset
 from PIL import Image
 import numpy as np
-import torch
-# ─────────────────────────────────────────────────────────────
-# 1) LOAD PIPELINES
-# ─────────────────────────────────────────────────────────────
 @st.cache_resource(show_spinner=False)
 def load_captioner():
-    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device="cpu")
 @st.cache_resource(show_spinner=False)
-def load_story_generator():
-    return pipeline("text-generation", model="microsoft/Phi-4-mini-reasoning", device="cpu")
 @st.cache_resource(show_spinner=False)
 def load_tts_pipe():
-    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-    model = pipeline("text-to-speech", model="microsoft/speecht5_tts", device="cpu")
-    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-    speaker_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-    speaker_embedding = torch.tensor(speaker_dataset[7306]["xvector"]).unsqueeze(0)
-    return processor, model, vocoder, speaker_embedding
-# ─────────────────────────────────────────────────────────────
-# 2) PIPELINE FUNCTIONS
-# ─────────────────────────────────────────────────────────────
-def get_caption(image, captioner):
-    return captioner(image)[0]['generated_text']
-def generate_story(caption, generator):
-    prompt = f"Write a short, magical story for children aged 3 to 10 based on this scene: {caption}. Keep it under 100 words."
-    outputs = generator(
-        prompt,
-        max_new_tokens=120,
-        temperature=0.8,
-        top_p=0.95,
-        do_sample=True
     )
-    story = outputs[0]["generated_text"]
-    return clean_story_output(story, prompt)
-def clean_story_output(story, prompt):
-    story = story[len(prompt):].strip() if story.startswith(prompt) else story
-    if "." in story:
-        story = story[: story.rfind(".") + 1]
-    return sentence_case(story)
-def sentence_case(text):
     parts = re.split(r'([.!?])', text)
     out = []
     for i in range(0, len(parts) - 1, 2):
         sentence = parts[i].strip().capitalize()
-        out.append(f"{sentence}{parts[i + 1]}")
     if len(parts) % 2:
         last = parts[-1].strip().capitalize()
         if last:
             out.append(last)
     return " ".join(out)
-def convert_to_audio(text, processor, tts_pipe, vocoder, speaker_embedding):
-    inputs = processor(text=text, return_tensors="pt")
-    speech = tts_pipe.model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
-    pcm = (speech.numpy() * 32767).astype(np.int16)
     buffer = io.BytesIO()
-    with wave.open(buffer, "wb") as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(16000)
-        wf.writeframes(pcm.tobytes())
     buffer.seek(0)
     return buffer.read()
-# ─────────────────────────────────────────────────────────────
-# 3) STREAMLIT APP UI
-# ─────────────────────────────────────────────────────────────
-st.set_page_config(page_title="Magic Storyteller", layout="centered")
-st.title("Magic Storyteller")
-st.markdown("Upload an image to generate a magical story and hear it read aloud!")
-uploaded = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
-if uploaded:
-    image = Image.open(uploaded)
-    st.image(image, caption="Your uploaded image", use_container_width=True)
-    if st.button("✨ Generate Story and Audio"):
-        st.subheader("🖼️ Step 1: Captioning")
-        captioner = load_captioner()
-        caption = get_caption(image, captioner)
-        st.markdown(f"**Caption:** {sentence_case(caption)}")
-        st.subheader("📖 Step 2: Story Generation")
-        story_pipe = load_story_generator()
-        story = generate_story(caption, story_pipe)
-        st.write(story)
-        st.subheader("🔊 Step 3: Listen to the Story")
-        processor, tts_pipe, vocoder, speaker_embedding = load_tts_pipe()
-        audio_bytes = convert_to_audio(story, processor, tts_pipe, vocoder, speaker_embedding)
-        st.audio(audio_bytes, format="audio/wav")
-        st.balloons()
-else:
-    st.info("Please upload an image to begin.")

 import wave
 import re
 import streamlit as st
+from transformers import pipeline
 from PIL import Image
 import numpy as np
+# 1) CACHE & LOAD MODELS
 @st.cache_resource(show_spinner=False)
 def load_captioner():
+    return pipeline(
+        "image-to-text",
+        model="Salesforce/blip-image-captioning-base",
+        device="cpu"
+    )
 @st.cache_resource(show_spinner=False)
+def load_story_pipe():
+    return pipeline(
+        "text2text-generation",
+        model="google/flan-t5-base",
+        device="cpu"
+    )
 @st.cache_resource(show_spinner=False)
 def load_tts_pipe():
+    return pipeline(
+        "text-to-speech",
+        model="facebook/mms-tts-eng",
+        device="cpu"
     )
+# 2) HELPER FUNCTIONS
def sentence_case(text: str) -> str:
    """Upper-case the first letter of every sentence in *text*.

    The text is split on runs of sentence-ending punctuation (``.``, ``!``,
    ``?``). Each sentence is stripped of surrounding whitespace, its first
    character is upper-cased, and the sentences are joined with single
    spaces, each followed by its own punctuation.

    Only the first character of a sentence is changed. Capitals that already
    appear inside a sentence (names, the pronoun "I", acronyms) are kept,
    whereas ``str.capitalize`` would lower-case them. Consecutive punctuation
    such as ``...`` or ``?!`` stays together as one delimiter instead of
    being spread out as ``. . .``.

    Args:
        text: Raw text, possibly containing several sentences.

    Returns:
        The re-cased text, or an empty string when *text* holds nothing but
        whitespace.
    """
    def _upper_first(fragment: str) -> str:
        # Slicing keeps this safe for the empty string.
        return fragment[:1].upper() + fragment[1:]

    # The capturing group makes re.split keep the delimiters, so the list
    # alternates: sentence, delimiter, sentence, ..., trailing remainder.
    parts = re.split(r'([.!?]+)', text)
    out = [
        f"{_upper_first(body.strip())}{delimiter}"
        for body, delimiter in zip(parts[0::2], parts[1::2])
    ]
    # The last element is whatever follows the final delimiter (may be empty).
    tail = _upper_first(parts[-1].strip())
    if tail:
        out.append(tail)
    return " ".join(out)
def caption_image(img: Image.Image, captioner) -> str:
    """Return the caption the captioner produces for *img*.

    Args:
        img: Image handed to *captioner* unchanged.
        captioner: Callable that takes the image and returns a sequence of
            dict-like results.

    Returns:
        The ``"generated_text"`` value of the first result. An empty string
        is returned when the captioner yields nothing or the first result
        has no such key.
    """
    predictions = captioner(img)
    if predictions:
        first = predictions[0]
        return first.get("generated_text", "")
    return ""
def story_from_caption(caption: str, pipe) -> str:
    """Generate a short story about the scene described by *caption*.

    The caption is embedded in a fixed prompt and passed to *pipe* together
    with the sampling settings below. The generated text is then cleaned up:
    an echoed prompt is removed, a trailing unfinished sentence is dropped,
    and the result is run through ``sentence_case``.

    Args:
        caption: Scene description inserted into the prompt.
        pipe: Callable accepting the prompt plus generation keyword
            arguments and returning a sequence of dict-like results.

    Returns:
        The cleaned story, or an empty string when *pipe* returns no result.
    """
    prompt = f"Write a vivid, imaginative ~100-word story about this scene: {caption}"
    results = pipe(
        prompt,
        max_length=100,
        min_length=80,
        do_sample=True,
        top_k=100,
        top_p=0.9,
        temperature=0.7,
        repetition_penalty=1.1,
        no_repeat_ngram_size=4,
        early_stopping=False
    )
    # Same convention as caption_image: no result means an empty string,
    # not an IndexError/KeyError.
    if not results:
        return ""
    raw = results[0].get("generated_text", "").strip()
    # Some models repeat the prompt at the start of their output.
    if raw.lower().startswith(prompt.lower()):
        raw = raw[len(prompt):].strip()
    # Cut after the last sentence terminator so the story does not end
    # mid-sentence. All three marks count, matching sentence_case; looking
    # only for "." would discard complete sentences ending in "!" or "?".
    last_end = max(raw.rfind(mark) for mark in ".!?")
    if last_end != -1:
        raw = raw[: last_end + 1]
    return sentence_case(raw)
def tts_bytes(text: str, tts_pipe) -> bytes:
    """Synthesise *text* with *tts_pipe* and return it as WAV file bytes.

    Args:
        text: Text to speak; passed to *tts_pipe* unchanged.
        tts_pipe: Callable returning a mapping (or a list whose first item
            is such a mapping) with the keys ``"audio"`` (array of float
            samples) and ``"sampling_rate"``.

    Returns:
        A complete 16-bit PCM WAV file as bytes.
    """
    output = tts_pipe(text)
    result = output[0] if isinstance(output, list) else output
    rate = result["sampling_rate"]
    samples = np.asarray(result["audio"])
    # 2-D audio is transposed so that rows become frames and columns become
    # channels -- assumes the pipeline emits (channels, samples); TODO confirm.
    if samples.ndim == 2:
        samples = samples.T
    channels = 1 if samples.ndim == 1 else samples.shape[1]
    # Clip to [-1, 1] before scaling: values outside that range would wrap
    # around in the int16 cast and turn into loud clicks. "<i2" pins the
    # little-endian byte order that WAV requires, whatever the host is.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767).astype("<i2")
    buffer = io.BytesIO()
    # The context manager finalises the WAV header and closes the writer
    # even if writing fails; the BytesIO itself stays open.
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(2)
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    return buffer.getvalue()
+# 3) STREAMLIT UI ENHANCEMENTS
# Page setup, sidebar, upload column and the generate flow of the app.
st.set_page_config(
    page_title="Media Magic Storyteller",
    page_icon="🎨",
    layout="wide"
)
# Sidebar
with st.sidebar:
    st.header("🎨 Media Magic")
    st.markdown(
        "Upload an image and click 'Generate' to craft a magical story."
    )
    st.markdown("---")
    st.markdown("1️⃣ Upload your image\n2️⃣ Click 'Generate'\n3️⃣ Read & Listen!")
    st.markdown("---")
    st.markdown("Built with 💖 using Hugging Face & Streamlit.")
# Main
st.title("✨ Media Magic Storyteller")
col1, col2 = st.columns([1, 2])
with col1:
    uploaded = st.file_uploader("Upload Image", type=["jpg", "jpeg", "png"])
    if uploaded:
        st.image(uploaded, caption="Your Image", use_container_width=True)
with col2:
    st.write("### Your Story")
    # Slot under the heading; filled with the results once they exist.
    placeholder = st.empty()
if uploaded and st.button("🚀 Generate Story & Audio"):
    progress = st.progress(0)
    # Render the results inside the "Your Story" column. Previously the
    # placeholder was never used, so that column stayed empty and the
    # output appeared below both columns.
    results = placeholder.container()
    # Captioning
    progress.progress(10)
    captioner = load_captioner()
    caption = sentence_case(caption_image(Image.open(uploaded), captioner))
    results.subheader("🖼️ Caption")
    results.info(caption)
    # Story
    progress.progress(40)
    story_pipe = load_story_pipe()
    story = story_from_caption(caption, story_pipe)
    results.subheader("📖 Story")
    results.write(story)
    # Audio
    progress.progress(70)
    tts_pipe = load_tts_pipe()
    audio = tts_bytes(story, tts_pipe)
    results.subheader("🔊 Audio")
    results.audio(audio, format="audio/wav")
    progress.progress(100)
    st.balloons()
# Footer
st.markdown("---")
st.markdown("&copy; 2025 Media Magic | https://huggingface.co")