Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
from kokoro import KPipeline
|
3 |
+
import streamlit as st
|
4 |
+
from PIL import Image
|
5 |
+
import torch
|
6 |
+
|
7 |
+
# 1) Vision→Text: BLIP-2
# Built once at module import so Streamlit's script reruns reuse the loaded model.
_use_cuda = torch.cuda.is_available()
captioner = pipeline(
    "image-text-to-text",
    model="Salesforce/blip2-opt-2.7b",
    # GPU with bfloat16 when available, otherwise CPU with full-precision floats.
    device="cuda" if _use_cuda else "cpu",
    torch_dtype=torch.bfloat16 if _use_cuda else torch.float32,
)
|
14 |
+
|
15 |
+
# 2) TTS: Kokoro
# Single shared text-to-speech pipeline for the whole app.
# lang_code="a" — presumably selects American English per Kokoro's voice
# naming convention (voice "af_heart" below is used with it); confirm
# against the kokoro package docs.
tts = KPipeline(lang_code="a")
|
17 |
+
|
18 |
+
def generate_story(img: Image.Image, max_words: int = 100) -> str:
    """Ask the BLIP-2 captioner for a short story about *img*.

    The model is prompted for a 50-100 word story; the returned text is
    hard-truncated to at most ``max_words`` whitespace-separated words.
    """
    prompt = "Tell me a fun 50–100 word story about this image."
    result = captioner(
        img,
        text=prompt,
        max_new_tokens=200,
        num_beams=4,
        return_full_text=False,
    )
    # Pipeline output is a list of dicts; take the generated text of the
    # first (only) candidate and cap its word count.
    words = result[0]["generated_text"].split()
    return " ".join(words[:max_words])
|
28 |
+
|
29 |
+
# 3) Streamlit UI
st.set_page_config(page_title="BLIP-2 Storyteller")
st.title("📖 BLIP-2 + Kokoro Storyteller")
st.write("Upload an image and hear a 50–100 word story!")

uploaded = st.file_uploader("Choose an image…", type=["png", "jpg", "jpeg"])
if uploaded:
    img = Image.open(uploaded)
    # use_container_width is the documented replacement for the deprecated
    # use_column_width parameter (removed in recent Streamlit releases).
    st.image(img, use_container_width=True)

    if st.button("Tell My Story"):
        with st.spinner("Generating…"):
            story = generate_story(img)
        st.markdown(f"**Story ({len(story.split())} words):**\n\n{story}")

        with st.spinner("Speaking…"):
            # Kokoro yields (graphemes, phonemes, audio) tuples; only the
            # first chunk is played. NOTE(review): a long story could be
            # split into several chunks — confirm a <=100-word story fits
            # in one, or concatenate the chunks.
            gen = tts(story, voice="af_heart")
            _, _, audio = next(gen)
        # sample_rate matches Kokoro's 24 kHz output declared here.
        # NOTE(review): audio is presumably a torch tensor — verify
        # st.audio accepts it directly, else pass audio.numpy().
        st.audio(audio, format="audio/wav", sample_rate=24000)
        st.success("Enjoy! 🎉")
|