by gemini
app.py
CHANGED
@@ -178,13 +178,13 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
     result = {"text": "", "image": None}
     # Call inferencer with or without think parameter based on user choice
     for i in inferencer(text=prompt, think=show_thinking, understanding_output=False, **inference_hyper):
-        print(type(i))
+        # print(type(i)) # For debugging stream
         if type(i) == str:
             result["text"] += i
         else:
             result["image"] = i
 
-    yield result["image"], result.get("text",
+    yield result["image"], result.get("text", "")
 
 
 # Image Understanding function with thinking option and hyperparameters
@@ -192,7 +192,8 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
 def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
                         do_sample=False, text_temperature=0.3, max_new_tokens=512):
     if image is None:
-
+        yield "Please upload an image for understanding."
+        return
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
@@ -203,22 +204,24 @@ def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
     inference_hyper = dict(
         do_sample=do_sample,
         temperature=text_temperature,
-        max_think_token_n=max_new_tokens, # Set max_length
+        max_think_token_n=max_new_tokens, # Set max_length for text generation
     )
 
-
+    result_text = ""
     # Use show_thinking parameter to control thinking process
     for i in inferencer(image=image, text=prompt, think=show_thinking,
                         understanding_output=True, **inference_hyper):
         if type(i) == str:
-
-
-
-
+            result_text += i
+            yield result_text
+        # else: This branch seems unused in original, as understanding_output=True typically yields text.
+        # If it yielded image, it would be an intermediate. For final output, it's text.
+        # For now, we assume it only yields text.
+    yield result_text # Ensure final text is yielded
 
 
 # Image Editing function with thinking option and hyperparameters
-@spaces.GPU(duration=
+@spaces.GPU(duration=90)
 def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_scale=4.0,
                cfg_img_scale=2.0, cfg_interval=0.0,
                timestep_shift=3.0, num_timesteps=50, cfg_renorm_min=1.0,
@@ -228,7 +231,8 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
     set_seed(seed)
 
     if image is None:
-
+        yield None, "Please upload an image for editing." # Yield tuple for image/text
+        return
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
@@ -257,7 +261,7 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
         else:
             result["image"] = i
 
-    yield result["image"], result.get("text", "")
+    yield result["image"], result.get("text", "") # Yield tuple for image/text
 
 # Helper function to load example images
 def load_example_image(image_path):
@@ -267,201 +271,232 @@ def load_example_image(image_path):
         print(f"Error loading example image: {e}")
         return None
 
-
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("""
-    <div>
-
-    </div>
-    [old lines 277-283 not recoverable from this view]
+    <div>
+    <img src="https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/banner.png" alt="BAGEL" width="380"/>
+    </div>
+    # BAGEL Multimodal Chatbot
+    Interact with BAGEL to generate images from text, edit existing images, or understand image content.
+    """)
+
+    # Chatbot display area
+    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(None, "https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/BAGEL_favicon.png"))
+
+    # Input area
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Optional: Upload an Image (for Image Understanding/Edit)", scale=0.5, value=None)
 
+        with gr.Column(scale=1.5):
+            user_prompt = gr.Textbox(label="Your Message", placeholder="Type your prompt here...", lines=3)
+
+            with gr.Row():
+                mode_selector = gr.Radio(
+                    choices=["Text to Image", "Image Understanding", "Image Edit"],
+                    value="Text to Image",
+                    label="Select Mode",
+                    interactive=True
+                )
+                submit_btn = gr.Button("Send", variant="primary")
+
+    # Global/Shared Hyperparameters
+    with gr.Accordion("General Settings & Hyperparameters", open=False) as general_accordion:
         with gr.Row():
-    [old lines 286-287 not recoverable from this view]
-            # Add hyperparameter controls in an accordion
-            with gr.Accordion("Inference Hyperparameters", open=False):
-                # Two parameters per row layout
-                with gr.Group():
-                    with gr.Row():
-                        seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1,
-                                         label="Seed", info="0 for random seed, positive for reproducible results")
-                        image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"],
-                                                  value="1:1", label="Image Ratio",
-                                                  info="The longer size is fixed to 1024")
-
-                    with gr.Row():
-                        cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, interactive=True,
-                                                   label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0)")
-                        cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1,
-                                                 label="CFG Interval", info="Start of CFG application interval (end is fixed at 1.0)")
-
-                    with gr.Row():
-                        cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"],
-                                                      value="global", label="CFG Renorm Type",
-                                                      info="If the generated image is blurry, use 'global'")
-                        cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
-                                                   label="CFG Renorm Min", info="1.0 disables CFG-Renorm")
-
-                    with gr.Row():
-                        num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, interactive=True,
-                                                  label="Timesteps", info="Total denoising steps")
-                        timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, interactive=True,
-                                                   label="Timestep Shift", info="Higher values for layout, lower for details")
-
-                # Thinking parameters in a single row
-                thinking_params = gr.Group(visible=False)
-                with thinking_params:
-                    with gr.Row():
-                        do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
-                        max_think_token_n = gr.Slider(minimum=64, maximum=4006, value=1024, step=64, interactive=True,
-                                                      label="Max Think Tokens", info="Maximum number of tokens for thinking")
-                        text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
-                                                     label="Temperature", info="Controls randomness in text generation")
-
-            thinking_output = gr.Textbox(label="Thinking Process", visible=False)
-            img_output = gr.Image(label="Generated Image")
-            gen_btn = gr.Button("Generate")
+            show_thinking_global = gr.Checkbox(label="Show Thinking Process", value=False, info="Enable to see model's intermediate thinking text.")
+            seed_global = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, label="Seed", info="0 for random seed, positive for reproducible results.")
 
-        #
-    [old lines 333-351 not recoverable from this view]
+    # Container for thinking-specific parameters, visibility controlled by show_thinking_global
+    thinking_params_container = gr.Group(visible=False)
+    with thinking_params_container:
+        gr.Markdown("#### Thinking Process Parameters (affect text generation)")
+        with gr.Row():
+            common_do_sample = gr.Checkbox(label="Enable Sampling", value=False, info="Enable sampling for text generation (otherwise greedy).")
+            common_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Text Temperature", info="Controls randomness in text generation (higher = more random).")
+            common_max_think_token_n = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Think Tokens / Max New Tokens", info="Maximum number of tokens for thinking (T2I/Edit) or generated text (Understanding).")
+
+    # T2I Hyperparameters
+    t2i_params_accordion = gr.Accordion("Text to Image Specific Parameters", open=False)
+    with t2i_params_accordion:
+        gr.Markdown("#### Text to Image Parameters")
+        with gr.Row():
+            t2i_image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"], value="1:1", label="Image Ratio", info="The longer size is fixed to 1024 pixels.")
+        with gr.Row():
+            t2i_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0 recommended).")
+            t2i_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1, label="CFG Interval", info="Start of Classifier-Free Guidance application interval (end is fixed at 1.0).")
+        with gr.Row():
+            t2i_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="global", label="CFG Renorm Type", info="Normalization type for CFG. Use 'global' if the generated image is blurry.")
+            t2i_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization (1.0 disables CFG-Renorm).")
+        with gr.Row():
+            t2i_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image generation.")
+            t2i_timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details.")
 
-
+    # Image Edit Hyperparameters
+    edit_params_accordion = gr.Accordion("Image Edit Specific Parameters", open=False)
+    with edit_params_accordion:
+        gr.Markdown("#### Image Edit Parameters")
         with gr.Row():
-    [old lines 355-361 not recoverable from this view]
-            with gr.Column(scale=1):
-                edit_image_output = gr.Image(label="Result")
-                edit_thinking_output = gr.Textbox(label="Thinking Process", visible=False)
-
+            edit_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt for editing.")
+            edit_cfg_img_scale = gr.Slider(minimum=1.0, maximum=4.0, value=2.0, step=0.1, label="CFG Image Scale", info="Controls how much the model preserves input image details during editing.")
+        with gr.Row():
+            edit_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Interval", info="Start of CFG application interval for editing (end is fixed at 1.0).")
+            edit_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="text_channel", label="CFG Renorm Type", info="Normalization type for CFG during editing. Use 'global' if output is blurry.")
+        with gr.Row():
+            edit_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization during editing (1.0 disables CFG-Renorm).")
         with gr.Row():
-
-        #
-    [old lines 370-376 not recoverable from this view]
-    [old lines 378-389 not recoverable from this view]
-    [old lines 391-395 not recoverable from this view]
-        #
-    [old lines 399-465 not recoverable from this view]
+            edit_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image editing.")
+            edit_timestep_shift = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details during editing.")
+
+    # Main chat processing function
+    @spaces.GPU(duration=90) # Apply GPU decorator to the combined function
+    def process_chat_message(history, prompt, uploaded_image, mode,
+                             show_thinking_global_val, seed_global_val,
+                             common_do_sample_val, common_text_temperature_val, common_max_think_token_n_val,
+                             t2i_cfg_text_scale_val, t2i_cfg_interval_val, t2i_timestep_shift_val,
+                             t2i_num_timesteps_val, t2i_cfg_renorm_min_val, t2i_cfg_renorm_type_val,
+                             t2i_image_ratio_val,
+                             edit_cfg_text_scale_val, edit_cfg_img_scale_val, edit_cfg_interval_val,
+                             edit_timestep_shift_val, edit_num_timesteps_val, edit_cfg_renorm_min_val,
+                             edit_cfg_renorm_type_val):
+
+        # Append user message to history
+        history.append([prompt, None])
+
+        # Define common parameters for inference functions
+        common_infer_params = dict(
+            show_thinking=show_thinking_global_val,
+            do_sample=common_do_sample_val,
+            text_temperature=common_text_temperature_val,
+        )
+
+        try:
+            if mode == "Text to Image":
+                # Add T2I specific parameters, including max_think_token_n and seed
+                t2i_params = {
+                    **common_infer_params,
+                    "max_think_token_n": common_max_think_token_n_val,
+                    "seed": seed_global_val,
+                    "cfg_text_scale": t2i_cfg_text_scale_val,
+                    "cfg_interval": t2i_cfg_interval_val,
+                    "timestep_shift": t2i_timestep_shift_val,
+                    "num_timesteps": t2i_num_timesteps_val,
+                    "cfg_renorm_min": t2i_cfg_renorm_min_val,
+                    "cfg_renorm_type": t2i_cfg_renorm_type_val,
+                    "image_ratio": t2i_image_ratio_val,
+                }
+
+                for img, txt in text_to_image(
+                    prompt=prompt,
+                    **t2i_params
+                ):
+                    # For Text to Image, yield image first, then thinking text (if available)
+                    if img is not None:
+                        history[-1] = [prompt, (img, txt)]
+                    elif txt: # Only update text if image is not ready yet
+                        history[-1] = [prompt, txt]
+                    yield history, gr.update(value="") # Update chatbot and clear input
+
+            elif mode == "Image Understanding":
+                if uploaded_image is None:
+                    history[-1] = [prompt, "Please upload an image for Image Understanding."]
+                    yield history, gr.update(value="")
+                    return
+
+                # Add Understanding specific parameters (max_new_tokens maps to common_max_think_token_n)
+                # Note: seed is not used in image_understanding
+                understand_params = {
+                    **common_infer_params,
+                    "max_new_tokens": common_max_think_token_n_val,
+                }
+                # Remove seed from parameters as it's not used by image_understanding
+                understand_params.pop('seed', None)
+
+                for txt in image_understanding(
+                    image=uploaded_image,
+                    prompt=prompt,
+                    **understand_params
+                ):
+                    history[-1] = [prompt, txt]
+                    yield history, gr.update(value="")
+
+            elif mode == "Image Edit":
+                if uploaded_image is None:
+                    history[-1] = [prompt, "Please upload an image for Image Editing."]
+                    yield history, gr.update(value="")
+                    return
+
+                # Add Edit specific parameters, including max_think_token_n and seed
+                edit_params = {
+                    **common_infer_params,
+                    "max_think_token_n": common_max_think_token_n_val,
+                    "seed": seed_global_val,
+                    "cfg_text_scale": edit_cfg_text_scale_val,
+                    "cfg_img_scale": edit_cfg_img_scale_val,
+                    "cfg_interval": edit_cfg_interval_val,
+                    "timestep_shift": edit_timestep_shift_val,
+                    "num_timesteps": edit_num_timesteps_val,
+                    "cfg_renorm_min": edit_cfg_renorm_min_val,
+                    "cfg_renorm_type": edit_cfg_renorm_type_val,
+                }
+
+                for img, txt in edit_image(
+                    image=uploaded_image,
+                    prompt=prompt,
+                    **edit_params
+                ):
+                    # For Image Edit, yield image first, then thinking text (if available)
+                    if img is not None:
+                        history[-1] = [prompt, (img, txt)]
+                    elif txt: # Only update text if image is not ready yet
+                        history[-1] = [prompt, txt]
+                    yield history, gr.update(value="")
+
+        except Exception as e:
+            history[-1] = [prompt, f"An error occurred: {e}"]
+            yield history, gr.update(value="") # Update history with error and clear input
+
+    # Event handlers for dynamic UI updates and submission
+    # Control visibility of thinking parameters
+    show_thinking_global.change(
+        fn=lambda x: gr.update(visible=x),
+        inputs=[show_thinking_global],
+        outputs=[thinking_params_container]
+    )
+
+    # Clear image input if mode switches to Text to Image
+    mode_selector.change(
+        fn=lambda mode: gr.update(value=None) if mode == "Text to Image" else gr.update(),
+        inputs=[mode_selector],
+        outputs=[image_input]
+    )
+
+    # List of all input components whose values are passed to process_chat_message
+    inputs_list = [
+        chatbot, user_prompt, image_input, mode_selector,
+        show_thinking_global, seed_global,
+        common_do_sample, common_text_temperature, common_max_think_token_n,
+        t2i_cfg_text_scale, t2i_cfg_interval, t2i_timestep_shift,
+        t2i_num_timesteps, t2i_cfg_renorm_min, t2i_cfg_renorm_type,
+        t2i_image_ratio,
+        edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
+        edit_timestep_shift, edit_num_timesteps, edit_cfg_renorm_min,
+        edit_cfg_renorm_type
+    ]
+
+    # Link submit button and text input 'Enter' key to the processing function
+    submit_btn.click(
+        fn=process_chat_message,
+        inputs=inputs_list,
+        outputs=[chatbot, user_prompt],
+        scroll_to_output=True,
+        queue=False, # Set to True if long generation times cause issues, but might affect responsiveness
+    )
+    user_prompt.submit( # Allows pressing Enter in textbox to submit
+        fn=process_chat_message,
+        inputs=inputs_list,
+        outputs=[chatbot, user_prompt],
+        scroll_to_output=True,
+        queue=False,
+    )
 
 demo.launch()
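
Editor's note on the streaming pattern introduced above: process_chat_message is a generator that repeatedly yields the updated chat history together with gr.update(value="") so the Chatbot re-renders and the textbox clears while output streams in. Below is a minimal, self-contained sketch of the same pattern; the component names and the fake token stream are illustrative, not part of the commit. One caveat worth flagging: Gradio streams generator output through its queue, so the sketch enables it explicitly, whereas the handlers above are wired with queue=False.

# Minimal sketch of the generator-streaming pattern used by process_chat_message.
# The token list is a stand-in for a real model's stream.
import time
import gradio as gr

def stream_reply(history, prompt):
    history = history + [[prompt, None]]         # add the user turn
    reply = ""
    for token in ["Hello", ", ", "world", "!"]:  # pretend these come from a model
        reply += token
        history[-1][1] = reply                   # fill in the assistant turn
        time.sleep(0.1)
        yield history, gr.update(value="")       # re-render chat, clear the textbox

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    box = gr.Textbox()
    box.submit(stream_reply, inputs=[chatbot, box], outputs=[chatbot, box])

demo.queue().launch()  # generator handlers need the queue enabled to stream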
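
The show_thinking_global.change(...) wiring above toggles thinking_params_container by returning gr.update(visible=...) from a lambda. The same pattern in isolation (component names here are illustrative, not from the commit):

# Sketch of the checkbox-driven visibility toggle used for thinking_params_container.
import gradio as gr

with gr.Blocks() as demo:
    show = gr.Checkbox(label="Show advanced options", value=False)
    advanced = gr.Group(visible=False)
    with advanced:
        gr.Slider(minimum=0.1, maximum=1.0, value=0.3, label="Temperature")

    # The checkbox value feeds the lambda; its gr.update return sets the group's visibility.
    show.change(fn=lambda x: gr.update(visible=x), inputs=[show], outputs=[advanced])

demo.launch()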
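
On @spaces.GPU(duration=90): the decorator comes from the spaces package available on Hugging Face ZeroGPU Spaces, and duration requests the GPU allocation window per call, in seconds. A hedged sketch of its use; run_model is a hypothetical stand-in, and the snippet only runs inside a Space:

# ZeroGPU sketch; the decorated function gets CUDA for up to ~90 s per invocation.
import spaces
import torch

@spaces.GPU(duration=90)
def run_model(prompt: str) -> str:
    device = "cuda" if torch.cuda.is_available() else "cpu"  # "cuda" on ZeroGPU
    return f"ran on {device}: {prompt}"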