Update app.py
app.py CHANGED

@@ -1,5 +1,5 @@
 import streamlit as st
-from transformers import AutoProcessor,
+from transformers import AutoProcessor, AutoModelForImageTextToText, pipeline
 import torch
 from PIL import Image
 import io
@@ -9,8 +9,8 @@ from kokoro import KPipeline # For text-to-speech
 
 # Load models globally to avoid reloading them repeatedly
 # Image-to-Text model
-processor = AutoProcessor.from_pretrained("
-caption_model =
+processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
+caption_model = AutoModelForImageTextToText.from_pretrained("Salesforce/blip-image-captioning-large")
 
 # Text-to-Story model
 story_generator = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
@@ -52,7 +52,7 @@ def generate_audio(story):
     concatenated_audio = np.concatenate(audio_segments)
     # Write to a BytesIO buffer instead of saving to disk
     audio_buffer = io.BytesIO()
-
+    sf.write(audio_buffer, concatenated_audio, 24000, format='WAV')
     audio_buffer.seek(0)
     return audio_buffer
 
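The commit only loads the BLIP processor and captioning model; how they are used is not part of this diff. A minimal sketch, assuming the standard AutoProcessor / AutoModelForImageTextToText captioning flow, of how the objects loaded above could produce a caption (the image path and variable names are illustrative, not from the commit):

from PIL import Image

# Illustrative only: caption an image with the BLIP checkpoint loaded above.
image = Image.open("example.jpg").convert("RGB")        # any RGB image
inputs = processor(images=image, return_tensors="pt")   # pixel values for BLIP
output_ids = caption_model.generate(**inputs, max_new_tokens=50)
caption = processor.decode(output_ids[0], skip_special_tokens=True)

The resulting caption string would then presumably serve as the prompt for story_generator in the "Text-to-Story" step, though that wiring is outside this hunk.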
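In the generate_audio change, sf.write serializes the concatenated Kokoro output into the in-memory buffer at 24 kHz (Kokoro's output sample rate), so no temporary file is written to disk. A hedged usage sketch of how the returned buffer might be played back in the Streamlit UI (this call site is an assumption and does not appear in the diff):

# Assumed call site elsewhere in app.py: play the in-memory WAV in the browser.
audio_buffer = generate_audio(story)
st.audio(audio_buffer, format="audio/wav")  # st.audio accepts file-like objects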