Szeyu committed on
Commit
2e8ed85
·
verified ·
1 Parent(s): 32f5ce7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -6
app.py CHANGED
@@ -66,7 +66,7 @@ def get_caption(image_bytes):
66
  and generates a caption.
67
  """
68
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
69
- # Resize to speed up processing
70
  image.thumbnail((256, 256))
71
  caption = st.session_state.captioner(image)[0]["generated_text"]
72
  return caption
@@ -96,12 +96,32 @@ def get_story(caption):
96
  def get_audio(story):
97
  """
98
  Converts the generated story text into audio.
99
- Splits the text into 300-character chunks to reduce repeated TTS calls,
100
- concatenates the resulting audio chunks, and returns an in-memory WAV buffer.
101
  """
102
  chunks = textwrap.wrap(story, width=300)
103
- audio_chunks = [st.session_state.tts(chunk)["audio"].squeeze() for chunk in chunks]
104
- audio = np.concatenate(audio_chunks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  buffer = io.BytesIO()
106
  sf.write(buffer, audio, st.session_state.tts.model.config.sampling_rate, format="WAV")
107
  buffer.seek(0)
@@ -111,7 +131,7 @@ def get_audio(story):
111
  uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
112
  if uploaded_file is not None:
113
  try:
114
- load_models() # Make sure models are loaded
115
  image_bytes = uploaded_file.getvalue()
116
  # Display the uploaded image
117
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
 
66
  and generates a caption.
67
  """
68
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
69
+ # Resize image to 256x256 maximum for faster processing
70
  image.thumbnail((256, 256))
71
  caption = st.session_state.captioner(image)[0]["generated_text"]
72
  return caption
 
96
  def get_audio(story):
97
  """
98
  Converts the generated story text into audio.
99
+ Splits the text into 300-character chunks to reduce repeated TTS calls.
100
+ Checks each chunk, and if no valid audio is produced, creates a brief default silent audio.
101
  """
102
  chunks = textwrap.wrap(story, width=300)
103
+ audio_chunks = []
104
+ for chunk in chunks:
105
+ try:
106
+ output = st.session_state.tts(chunk)
107
+ # Some pipelines return a list; if so, use the first element.
108
+ if isinstance(output, list):
109
+ output = output[0]
110
+ if "audio" in output:
111
+ # Ensure the audio is a numpy array and squeeze any extra dimensions.
112
+ audio_array = np.array(output["audio"]).squeeze()
113
+ audio_chunks.append(audio_array)
114
+ except Exception as e:
115
+ # Skip any chunk that raises an error.
116
+ continue
117
+
118
+ # If no audio was generated, produce 1 second of silence as a fallback.
119
+ if not audio_chunks:
120
+ sr = st.session_state.tts.model.config.sampling_rate
121
+ audio = np.zeros(sr, dtype=np.float32)
122
+ else:
123
+ audio = np.concatenate(audio_chunks)
124
+
125
  buffer = io.BytesIO()
126
  sf.write(buffer, audio, st.session_state.tts.model.config.sampling_rate, format="WAV")
127
  buffer.seek(0)
 
131
  uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
132
  if uploaded_file is not None:
133
  try:
134
+ load_models() # Ensure models are loaded
135
  image_bytes = uploaded_file.getvalue()
136
  # Display the uploaded image
137
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")