Update app.py

app.py CHANGED
@@ -214,9 +214,7 @@ def call_edit_image(image, prompt, show_thinking, cfg_text_scale, cfg_img_scale,
 
 DEFAULT_WELCOME_MESSAGE = {
     "role": "assistant",
-    "content": [
-        {"type": "text", "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt."}
-    ],
+    "content": "Hello! I am BAGEL, your multimodal assistant. How can I help you today? Select a mode and enter your prompt.",
     "key": "welcome"
 }
 
@@ -264,7 +262,8 @@ class GradioApp:
         # This is simplified; best-gradio-ui.py stores settings per conversation
         current_turn_settings = {
             "mode": mode,
-
+            # Store PIL image directly if needed, or handle path carefully
+            "image_input": image_input, # Now storing the PIL image or None
             # TTI
             "tti_show_thinking": tti_show_thinking, "tti_cfg_text_scale": tti_cfg_text_scale, "tti_cfg_interval": tti_cfg_interval, "tti_timestep_shift": tti_timestep_shift, "tti_num_timesteps": tti_num_timesteps, "tti_cfg_renorm_min": tti_cfg_renorm_min, "tti_cfg_renorm_type": tti_cfg_renorm_type, "tti_max_think_token_n": tti_max_think_token_n, "tti_do_sample": tti_do_sample, "tti_text_temperature": tti_text_temperature, "tti_seed": tti_seed, "tti_image_ratio": tti_image_ratio,
             # Edit
@@ -274,20 +273,31 @@ class GradioApp:
         }
         self.conversation_contexts[self.current_conversation_id]["settings"] = current_turn_settings
 
-
+        user_content_list = []
         if text_input:
-
+            user_content_list.append({"type": "text", "text": text_input})
         if image_input and mode in ["Image Edit", "Image Understanding"]:
-            #
-            #
-
+            # For 'messages' format, images are typically handled by passing them as part of a list of content dicts.
+            # Gradio's Chatbot with type='messages' can render PIL Images or file paths directly in the 'content' list.
+            user_content_list.append({"type": "image", "image": image_input}) # Assuming image_input is PIL
 
-
-
+        # Construct the user message for history
+        # If only text, content can be a string. If mixed, it's a list of dicts.
+        user_message_for_history = {
+            "role": "user",
+            "content": text_input if not image_input else user_content_list,
+            "key": str(uuid.uuid4())
+        }
+        if not text_input and image_input:
+            user_message_for_history["content"] = user_content_list
+        elif not user_content_list:
+            # Handle case where there's no input at all, though prior checks should prevent this.
+            gr.Warning("No input provided.")
+            return self._get_current_history(), gr.update(value=None), gr.update(value=None)
 
 
-        history.append(
-        history.append({"role": "assistant", "content":
+        history.append(user_message_for_history)
+        history.append({"role": "assistant", "content": "Processing...", "key": str(uuid.uuid4())})
 
         yield history, gr.update(value=None), gr.update(value=None) # chatbot, text_input, image_input (clear inputs)
 
@@ -297,7 +307,8 @@ class GradioApp:
             output_text = None
             thinking_text = None
 
-
+            # image_input is already a PIL image from the gr.Image component with type="pil"
+            pil_image_input = image_input
 
             if mode == "Text to Image":
                 output_image, thinking_text = call_text_to_image(text_input, tti_show_thinking, tti_cfg_text_scale, tti_cfg_interval, tti_timestep_shift, tti_num_timesteps, tti_cfg_renorm_min, tti_cfg_renorm_type, tti_max_think_token_n, tti_do_sample, tti_text_temperature, tti_seed, tti_image_ratio)
@@ -323,17 +334,18 @@ class GradioApp:
 
             bot_response_content = []
             if thinking_text:
-
+                # For 'messages' type, each part of the content is a dict in a list
+                bot_response_content.append({"type": "text", "text": f"**Thinking Process:**\n{thinking_text}"})
             if output_text:
-                bot_response_content.append({"type": "text", "
-            if output_image:
-                bot_response_content.append({"type": "image", "
+                bot_response_content.append({"type": "text", "text": output_text})
+            if output_image: # output_image should be a PIL Image
+                bot_response_content.append({"type": "image", "image": output_image})
 
             if not bot_response_content:
-                bot_response_content.append({"type": "text", "
+                bot_response_content.append({"type": "text", "text": "(No output generated)"})
 
-
-            history[-1]["
+            # Update the last message (which was "Processing...")
+            history[-1]["content"] = bot_response_content[0]["text"] if len(bot_response_content) == 1 and bot_response_content[0]["type"] == "text" else bot_response_content
 
         except Exception as e:
             print(f"Error during processing: {e}")
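Note on the message shape this commit converges on: text-only turns use a plain string as "content" (as in the new DEFAULT_WELCOME_MESSAGE), mixed text-and-image turns use a list of {"type": ...} part-dicts, and a bot response holding a single text part is collapsed back to a plain string. The standalone sketch below mirrors that logic under stated assumptions: build_user_message and collapse_bot_content are hypothetical helper names that do not appear in app.py, and whether a given Gradio version's Chatbot with type="messages" renders the part-dict lists directly is not something the diff settles.

# Minimal sketch of the message-construction rules in this commit.
# build_user_message / collapse_bot_content are illustrative names only;
# they do not exist in app.py.
import uuid

def build_user_message(text_input=None, image_input=None):
    # Plain-string content for text-only turns; a list of typed part-dicts
    # once an image is involved (mirrors the user-message block in hunk 3).
    parts = []
    if text_input:
        parts.append({"type": "text", "text": text_input})
    if image_input is not None:
        parts.append({"type": "image", "image": image_input})
    if not parts:
        raise ValueError("No input provided.")
    content = text_input if image_input is None else parts
    return {"role": "user", "content": content, "key": str(uuid.uuid4())}

def collapse_bot_content(parts):
    # Mirrors the final assignment in the last hunk: a lone text part
    # collapses to a plain string, anything else stays a list of part-dicts.
    if not parts:
        parts = [{"type": "text", "text": "(No output generated)"}]
    if len(parts) == 1 and parts[0]["type"] == "text":
        return parts[0]["text"]
    return parts

assert build_user_message("Draw a bagel.")["content"] == "Draw a bagel."
assert collapse_bot_content([{"type": "text", "text": "Done!"}]) == "Done!"

The collapse step matters because a plain string is the simplest content shape for a messages-type Chatbot to render; the part-dict list is this app's own convention for mixed output, kept only when a turn actually carries more than one kind of content.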