Update app.py
Browse files
app.py
CHANGED
@@ -66,7 +66,7 @@ def get_caption(image_bytes):
|
|
66 |
and generates a caption.
|
67 |
"""
|
68 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
69 |
-
# Resize to
|
70 |
image.thumbnail((256, 256))
|
71 |
caption = st.session_state.captioner(image)[0]["generated_text"]
|
72 |
return caption
|
@@ -96,12 +96,32 @@ def get_story(caption):
|
|
96 |
def get_audio(story):
|
97 |
"""
|
98 |
Converts the generated story text into audio.
|
99 |
-
Splits the text into 300-character chunks to reduce repeated TTS calls
|
100 |
-
|
101 |
"""
|
102 |
chunks = textwrap.wrap(story, width=300)
|
103 |
-
audio_chunks = [
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
buffer = io.BytesIO()
|
106 |
sf.write(buffer, audio, st.session_state.tts.model.config.sampling_rate, format="WAV")
|
107 |
buffer.seek(0)
|
@@ -111,7 +131,7 @@ def get_audio(story):
|
|
111 |
uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
|
112 |
if uploaded_file is not None:
|
113 |
try:
|
114 |
-
load_models() #
|
115 |
image_bytes = uploaded_file.getvalue()
|
116 |
# Display the uploaded image
|
117 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
|
|
66 |
and generates a caption.
|
67 |
"""
|
68 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
69 |
+
# Resize image to 256x256 maximum for faster processing
|
70 |
image.thumbnail((256, 256))
|
71 |
caption = st.session_state.captioner(image)[0]["generated_text"]
|
72 |
return caption
|
|
|
96 |
def get_audio(story):
|
97 |
"""
|
98 |
Converts the generated story text into audio.
|
99 |
+
Splits the text into 300-character chunks so each TTS call stays within the model's input limits.
|
100 |
+
Processes each chunk individually; if no chunk yields valid audio, falls back to one second of silence.
|
101 |
"""
|
102 |
chunks = textwrap.wrap(story, width=300)
|
103 |
+
audio_chunks = []
|
104 |
+
for chunk in chunks:
|
105 |
+
try:
|
106 |
+
output = st.session_state.tts(chunk)
|
107 |
+
# Some pipelines return a list; if so, use the first element.
|
108 |
+
if isinstance(output, list):
|
109 |
+
output = output[0]
|
110 |
+
if "audio" in output:
|
111 |
+
# Ensure the audio is a numpy array and squeeze any extra dimensions.
|
112 |
+
audio_array = np.array(output["audio"]).squeeze()
|
113 |
+
audio_chunks.append(audio_array)
|
114 |
+
except Exception as e:
|
115 |
+
# Skip any chunk that raises an error.
|
116 |
+
continue
|
117 |
+
|
118 |
+
# If no audio was generated, produce 1 second of silence as a fallback.
|
119 |
+
if not audio_chunks:
|
120 |
+
sr = st.session_state.tts.model.config.sampling_rate
|
121 |
+
audio = np.zeros(sr, dtype=np.float32)
|
122 |
+
else:
|
123 |
+
audio = np.concatenate(audio_chunks)
|
124 |
+
|
125 |
buffer = io.BytesIO()
|
126 |
sf.write(buffer, audio, st.session_state.tts.model.config.sampling_rate, format="WAV")
|
127 |
buffer.seek(0)
|
|
|
131 |
uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
|
132 |
if uploaded_file is not None:
|
133 |
try:
|
134 |
+
load_models() # Ensure models are loaded
|
135 |
image_bytes = uploaded_file.getvalue()
|
136 |
# Display the uploaded image
|
137 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|