import streamlit as st
from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
import torch
from PIL import Image
import io
import numpy as np
from kokoro import KPipeline  # For text-to-speech
import soundfile as sf  # Needed by generate_audio to encode WAV output
# Load models once and cache them; without caching, Streamlit would reload
# every model on each rerun of the script (i.e. on every user interaction)
@st.cache_resource
def load_models():
    # Image-to-text (captioning) model
    processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    caption_model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-large")
    # Text-to-story model
    story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
    # Text-to-speech model ('a' selects Kokoro's American English voices)
    audio_pipeline = KPipeline(lang_code='a')
    return processor, caption_model, story_generator, audio_pipeline

processor, caption_model, story_generator, audio_pipeline = load_models()
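# Note: DeepSeek-R1-Distill-Qwen-14B has ~14B parameters and will not fit
# comfortably in CPU memory at full precision. A hedged sketch, assuming a
# CUDA GPU is available (torch_dtype and device_map are standard pipeline kwargs):
#   story_generator = pipeline("text-generation",
#                              model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
#                              torch_dtype=torch.bfloat16, device_map="auto")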
# Function to generate a caption from an image
def generate_caption(image_bytes):
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    # Unconditional captioning: an instruction passed as `text` would be
    # treated by BLIP as the start of the caption and echoed in the output
    inputs = processor(images=image, return_tensors="pt")
    outputs = caption_model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption
# Function to generate a story from a caption
def generate_story(caption):
    prompt = f"Based on the description '{caption}', tell a short story for children aged 3 to 10 in no more than 100 words."
    # return_full_text=False keeps the prompt out of the output, so the
    # word cap below applies to the story alone
    story_output = story_generator(prompt, max_new_tokens=150, num_return_sequences=1, return_full_text=False)
    story = story_output[0]["generated_text"]
    # Truncate to 100 words if the model overruns
    story_words = story.split()
    if len(story_words) > 100:
        story = " ".join(story_words[:100])
    return story
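# Note: DeepSeek-R1 distill models typically emit a <think>...</think>
# reasoning block before the answer. A minimal sketch for stripping it from
# the story, assuming the standard R1 tag format:
#   import re
#   story = re.sub(r"<think>.*?</think>", "", story, flags=re.DOTALL).strip()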
# Function to generate audio from a story
def generate_audio(story):
    audio_generator = audio_pipeline(
        story, voice='af_heart', speed=1, split_pattern=r'\n+'
    )
    # Collect all audio segments; the pipeline yields
    # (graphemes, phonemes, audio) tuples per text chunk
    audio_segments = []
    for _, _, audio in audio_generator:
        audio_segments.append(audio)
    if not audio_segments:
        return None
    # Concatenate audio segments into a single array
    concatenated_audio = np.concatenate(audio_segments)
    # Write to a BytesIO buffer instead of saving to disk (Kokoro outputs 24 kHz audio)
    audio_buffer = io.BytesIO()
    sf.write(audio_buffer, concatenated_audio, 24000, format='WAV')
    audio_buffer.seek(0)
    return audio_buffer
# Streamlit UI
st.title("Image to Story Audio Generator")
st.write("Upload an image to generate a short children's story (≤100 words) as audio.")
uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
if uploaded_file is not None:
    image_bytes = uploaded_file.read()
    st.image(image_bytes, caption="Uploaded Image", use_column_width=True)
    # Generate and display caption
    with st.spinner("Generating caption..."):
        caption = generate_caption(image_bytes)
    st.write("**Generated Caption:**")
    st.write(caption)
    # Generate and display story
    with st.spinner("Generating story..."):
        story = generate_story(caption)
    st.write("**Generated Story:**")
    st.write(story)
    # Generate and display audio
    with st.spinner("Generating audio..."):
        audio_buffer = generate_audio(story)
    if audio_buffer:
        # Pass raw bytes so playing the audio does not exhaust the buffer
        # before the download button reads it
        audio_bytes = audio_buffer.getvalue()
        st.audio(audio_bytes, format="audio/wav")
        st.download_button(
            label="Download Story Audio",
            data=audio_bytes,
            file_name="story_audio.wav",
            mime="audio/wav"
        )
    else:
        st.error("Failed to generate audio.")
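# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py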