KingNish committed
Commit 397bb2f · verified · 1 Parent(s): 9432b24
Files changed (1):
  app.py +228 -193
app.py CHANGED
@@ -178,13 +178,13 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
     result = {"text": "", "image": None}
     # Call inferencer with or without think parameter based on user choice
     for i in inferencer(text=prompt, think=show_thinking, understanding_output=False, **inference_hyper):
-        print(type(i))
+        # print(type(i)) # For debugging stream
         if type(i) == str:
            result["text"] += i
         else:
            result["image"] = i
 
-    yield result["image"], result.get("text", None)
+    yield result["image"], result.get("text", "")
 
 
 # Image Understanding function with thinking option and hyperparameters
@@ -192,7 +192,8 @@ def text_to_image(prompt, show_thinking=False, cfg_text_scale=4.0, cfg_interval=
 def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
                         do_sample=False, text_temperature=0.3, max_new_tokens=512):
     if image is None:
-        return "Please upload an image."
+        yield "Please upload an image for understanding."
+        return
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
@@ -203,22 +204,24 @@ def image_understanding(image: Image.Image, prompt: str, show_thinking=False,
     inference_hyper = dict(
         do_sample=do_sample,
         temperature=text_temperature,
-        max_think_token_n=max_new_tokens, # Set max_length
+        max_think_token_n=max_new_tokens, # Set max_length for text generation
     )
 
-    result = {"text": "", "image": None}
+    result_text = ""
     # Use show_thinking parameter to control thinking process
     for i in inferencer(image=image, text=prompt, think=show_thinking,
                         understanding_output=True, **inference_hyper):
         if type(i) == str:
-            result["text"] += i
-        else:
-            result["image"] = i
-        yield result["text"]
+            result_text += i
+            yield result_text
+        # else: This branch seems unused in original, as understanding_output=True typically yields text.
+        # If it yielded image, it would be an intermediate. For final output, it's text.
+        # For now, we assume it only yields text.
+    yield result_text # Ensure final text is yielded
 
 
 # Image Editing function with thinking option and hyperparameters
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=90)
 def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_scale=4.0,
                cfg_img_scale=2.0, cfg_interval=0.0,
                timestep_shift=3.0, num_timesteps=50, cfg_renorm_min=1.0,
@@ -228,7 +231,8 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
     set_seed(seed)
 
     if image is None:
-        return "Please upload an image.", ""
+        yield None, "Please upload an image for editing." # Yield tuple for image/text
+        return
 
     if isinstance(image, np.ndarray):
         image = Image.fromarray(image)
@@ -257,7 +261,7 @@ def edit_image(image: Image.Image, prompt: str, show_thinking=False, cfg_text_sc
         else:
             result["image"] = i
 
-    yield result["image"], result.get("text", "")
+    yield result["image"], result.get("text", "") # Yield tuple for image/text
 
 # Helper function to load example images
 def load_example_image(image_path):
@@ -267,201 +271,232 @@ def load_example_image(image_path):
         print(f"Error loading example image: {e}")
         return None
 
-
 # Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("""
-    <div>
-    <img src="https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/banner.png" alt="BAGEL" width="380"/>
-    </div>
-    """)
-
-    with gr.Tab("📝 Text to Image"):
-        txt_input = gr.Textbox(
-            label="Prompt",
-            value="A female cosplayer portraying an ethereal fairy or elf, wearing a flowing dress made of delicate fabrics in soft, mystical colors like emerald green and silver. She has pointed ears, a gentle, enchanting expression, and her outfit is adorned with sparkling jewels and intricate patterns. The background is a magical forest with glowing plants, mystical creatures, and a serene atmosphere."
-        )
+    <div>
+    <img src="https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/banner.png" alt="BAGEL" width="380"/>
+    </div>
+    # BAGEL Multimodal Chatbot
+    Interact with BAGEL to generate images from text, edit existing images, or understand image content.
+    """)
+
+    # Chatbot display area
+    chatbot = gr.Chatbot(label="Chat History", height=500, avatar_images=(None, "https://lf3-static.bytednsdoc.com/obj/eden-cn/nuhojubrps/BAGEL_favicon.png"))
+
+    # Input area
+    with gr.Row():
+        image_input = gr.Image(type="pil", label="Optional: Upload an Image (for Image Understanding/Edit)", scale=0.5, value=None)
 
+        with gr.Column(scale=1.5):
+            user_prompt = gr.Textbox(label="Your Message", placeholder="Type your prompt here...", lines=3)
+
+            with gr.Row():
+                mode_selector = gr.Radio(
+                    choices=["Text to Image", "Image Understanding", "Image Edit"],
+                    value="Text to Image",
+                    label="Select Mode",
+                    interactive=True
+                )
+                submit_btn = gr.Button("Send", variant="primary")
+
+    # Global/Shared Hyperparameters
+    with gr.Accordion("General Settings & Hyperparameters", open=False) as general_accordion:
         with gr.Row():
-            show_thinking = gr.Checkbox(label="Thinking", value=False)
-
-        # Add hyperparameter controls in an accordion
-        with gr.Accordion("Inference Hyperparameters", open=False):
-            # Layout: two parameters per row
-            with gr.Group():
-                with gr.Row():
-                    seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1,
-                                     label="Seed", info="0 for random seed, positive for reproducible results")
-                    image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"],
-                                              value="1:1", label="Image Ratio",
-                                              info="The longer size is fixed to 1024")
-
-                with gr.Row():
-                    cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, interactive=True,
-                                               label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0)")
-                    cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1,
-                                             label="CFG Interval", info="Start of CFG application interval (end is fixed at 1.0)")
-
-                with gr.Row():
-                    cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"],
-                                                  value="global", label="CFG Renorm Type",
-                                                  info="If the genrated image is blurry, use 'global'")
-                    cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
-                                               label="CFG Renorm Min", info="1.0 disables CFG-Renorm")
-
-                with gr.Row():
-                    num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, interactive=True,
-                                              label="Timesteps", info="Total denoising steps")
-                    timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, interactive=True,
-                                               label="Timestep Shift", info="Higher values for layout, lower for details")
-
-            # Thinking parameters in a single row
-            thinking_params = gr.Group(visible=False)
-            with thinking_params:
-                with gr.Row():
-                    do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
-                    max_think_token_n = gr.Slider(minimum=64, maximum=4006, value=1024, step=64, interactive=True,
-                                                  label="Max Think Tokens", info="Maximum number of tokens for thinking")
-                    text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
-                                                 label="Temperature", info="Controls randomness in text generation")
-
-        thinking_output = gr.Textbox(label="Thinking Process", visible=False)
-        img_output = gr.Image(label="Generated Image")
-        gen_btn = gr.Button("Generate")
+            show_thinking_global = gr.Checkbox(label="Show Thinking Process", value=False, info="Enable to see model's intermediate thinking text.")
+            seed_global = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, label="Seed", info="0 for random seed, positive for reproducible results.")
 
-        # Dynamically show/hide thinking process box and parameters
-        def update_thinking_visibility(show):
-            return gr.update(visible=show), gr.update(visible=show)
-
-        show_thinking.change(
-            fn=update_thinking_visibility,
-            inputs=[show_thinking],
-            outputs=[thinking_output, thinking_params]
-        )
-
-        gen_btn.click(
-            fn=text_to_image,
-            inputs=[
-                txt_input, show_thinking, cfg_text_scale,
-                cfg_interval, timestep_shift,
-                num_timesteps, cfg_renorm_min, cfg_renorm_type,
-                max_think_token_n, do_sample, text_temperature, seed, image_ratio
-            ],
-            outputs=[img_output, thinking_output]
-        )
+    # Container for thinking-specific parameters, visibility controlled by show_thinking_global
+    thinking_params_container = gr.Group(visible=False)
+    with thinking_params_container:
+        gr.Markdown("#### Thinking Process Parameters (affect text generation)")
+        with gr.Row():
+            common_do_sample = gr.Checkbox(label="Enable Sampling", value=False, info="Enable sampling for text generation (otherwise greedy).")
+            common_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, label="Text Temperature", info="Controls randomness in text generation (higher = more random).")
+            common_max_think_token_n = gr.Slider(minimum=64, maximum=4096, value=1024, step=64, label="Max Think Tokens / Max New Tokens", info="Maximum number of tokens for thinking (T2I/Edit) or generated text (Understanding).")
+
+    # T2I Hyperparameters
+    t2i_params_accordion = gr.Accordion("Text to Image Specific Parameters", open=False)
+    with t2i_params_accordion:
+        gr.Markdown("#### Text to Image Parameters")
+        with gr.Row():
+            t2i_image_ratio = gr.Dropdown(choices=["1:1", "4:3", "3:4", "16:9", "9:16"], value="1:1", label="Image Ratio", info="The longer size is fixed to 1024 pixels.")
+        with gr.Row():
+            t2i_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt (4.0-8.0 recommended).")
+            t2i_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.4, step=0.1, label="CFG Interval", info="Start of Classifier-Free Guidance application interval (end is fixed at 1.0).")
+        with gr.Row():
+            t2i_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="global", label="CFG Renorm Type", info="Normalization type for CFG. Use 'global' if the generated image is blurry.")
+            t2i_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization (1.0 disables CFG-Renorm).")
+        with gr.Row():
+            t2i_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image generation.")
+            t2i_timestep_shift = gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details.")
 
-    with gr.Tab("🖌️ Image Edit"):
+    # Image Edit Hyperparameters
+    edit_params_accordion = gr.Accordion("Image Edit Specific Parameters", open=False)
+    with edit_params_accordion:
+        gr.Markdown("#### Image Edit Parameters")
         with gr.Row():
-            with gr.Column(scale=1):
-                edit_image_input = gr.Image(label="Input Image", value=load_example_image('test_images/women.jpg'))
-                edit_prompt = gr.Textbox(
-                    label="Prompt",
-                    value="She boards a modern subway, quietly reading a folded newspaper, wearing the same clothes."
-                )
-
-            with gr.Column(scale=1):
-                edit_image_output = gr.Image(label="Result")
-                edit_thinking_output = gr.Textbox(label="Thinking Process", visible=False)
-
+            edit_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, label="CFG Text Scale", info="Controls how strongly the model follows the text prompt for editing.")
+            edit_cfg_img_scale = gr.Slider(minimum=1.0, maximum=4.0, value=2.0, step=0.1, label="CFG Image Scale", info="Controls how much the model preserves input image details during editing.")
+        with gr.Row():
+            edit_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Interval", info="Start of CFG application interval for editing (end is fixed at 1.0).")
+            edit_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"], value="text_channel", label="CFG Renorm Type", info="Normalization type for CFG during editing. Use 'global' if output is blurry.")
+        with gr.Row():
+            edit_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, label="CFG Renorm Min", info="Minimum value for CFG Renormalization during editing (1.0 disables CFG-Renorm).")
         with gr.Row():
-            edit_show_thinking = gr.Checkbox(label="Thinking", value=False)
+            edit_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, label="Timesteps", info="Total denoising steps for image editing.")
+            edit_timestep_shift = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=0.5, label="Timestep Shift", info="Higher values for layout control, lower for fine details during editing.")
+
+    # Main chat processing function
+    @spaces.GPU(duration=90) # Apply GPU decorator to the combined function
+    def process_chat_message(history, prompt, uploaded_image, mode,
+                             show_thinking_global_val, seed_global_val,
+                             common_do_sample_val, common_text_temperature_val, common_max_think_token_n_val,
+                             t2i_cfg_text_scale_val, t2i_cfg_interval_val, t2i_timestep_shift_val,
+                             t2i_num_timesteps_val, t2i_cfg_renorm_min_val, t2i_cfg_renorm_type_val,
+                             t2i_image_ratio_val,
+                             edit_cfg_text_scale_val, edit_cfg_img_scale_val, edit_cfg_interval_val,
+                             edit_timestep_shift_val, edit_num_timesteps_val, edit_cfg_renorm_min_val,
+                             edit_cfg_renorm_type_val):
+
+        # Append user message to history
+        history.append([prompt, None])
 
-        # Add hyperparameter controls in an accordion
-        with gr.Accordion("Inference Hyperparameters", open=False):
-            with gr.Group():
-                with gr.Row():
-                    edit_seed = gr.Slider(minimum=0, maximum=1000000, value=0, step=1, interactive=True,
-                                          label="Seed", info="0 for random seed, positive for reproducible results")
-                    edit_cfg_text_scale = gr.Slider(minimum=1.0, maximum=8.0, value=4.0, step=0.1, interactive=True,
-                                                    label="CFG Text Scale", info="Controls how strongly the model follows the text prompt")
+        # Define common parameters for inference functions
+        common_infer_params = dict(
+            show_thinking=show_thinking_global_val,
+            do_sample=common_do_sample_val,
+            text_temperature=common_text_temperature_val,
+        )
+
+        try:
+            if mode == "Text to Image":
+                # Add T2I specific parameters, including max_think_token_n and seed
+                t2i_params = {
+                    **common_infer_params,
+                    "max_think_token_n": common_max_think_token_n_val,
+                    "seed": seed_global_val,
+                    "cfg_text_scale": t2i_cfg_text_scale_val,
+                    "cfg_interval": t2i_cfg_interval_val,
+                    "timestep_shift": t2i_timestep_shift_val,
+                    "num_timesteps": t2i_num_timesteps_val,
+                    "cfg_renorm_min": t2i_cfg_renorm_min_val,
+                    "cfg_renorm_type": t2i_cfg_renorm_type_val,
+                    "image_ratio": t2i_image_ratio_val,
+                }
 
-                with gr.Row():
-                    edit_cfg_img_scale = gr.Slider(minimum=1.0, maximum=4.0, value=2.0, step=0.1, interactive=True,
-                                                   label="CFG Image Scale", info="Controls how much the model preserves input image details")
-                    edit_cfg_interval = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
-                                                  label="CFG Interval", info="Start of CFG application interval (end is fixed at 1.0)")
-
-                with gr.Row():
-                    edit_cfg_renorm_type = gr.Dropdown(choices=["global", "local", "text_channel"],
-                                                       value="text_channel", label="CFG Renorm Type",
-                                                       info="If the genrated image is blurry, use 'global")
-                    edit_cfg_renorm_min = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.1, interactive=True,
-                                                    label="CFG Renorm Min", info="1.0 disables CFG-Renorm")
+                for img, txt in text_to_image(
+                    prompt=prompt,
+                    **t2i_params
+                ):
+                    # For Text to Image, yield image first, then thinking text (if available)
+                    if img is not None:
+                        history[-1] = [prompt, (img, txt)]
+                    elif txt: # Only update text if image is not ready yet
+                        history[-1] = [prompt, txt]
+                    yield history, gr.update(value="") # Update chatbot and clear input
+
+            elif mode == "Image Understanding":
+                if uploaded_image is None:
+                    history[-1] = [prompt, "Please upload an image for Image Understanding."]
+                    yield history, gr.update(value="")
+                    return
 
-                with gr.Row():
-                    edit_num_timesteps = gr.Slider(minimum=10, maximum=100, value=50, step=5, interactive=True,
-                                                   label="Timesteps", info="Total denoising steps")
-                    edit_timestep_shift = gr.Slider(minimum=1.0, maximum=10.0, value=3.0, step=0.5, interactive=True,
-                                                    label="Timestep Shift", info="Higher values for layout, lower for details")
+                # Add Understanding specific parameters (max_new_tokens maps to common_max_think_token_n)
+                # Note: seed is not used in image_understanding
+                understand_params = {
+                    **common_infer_params,
+                    "max_new_tokens": common_max_think_token_n_val,
+                }
+                # Remove seed from parameters as it's not used by image_understanding
+                understand_params.pop('seed', None)
 
+                for txt in image_understanding(
+                    image=uploaded_image,
+                    prompt=prompt,
+                    **understand_params
+                ):
+                    history[-1] = [prompt, txt]
+                    yield history, gr.update(value="")
+
+            elif mode == "Image Edit":
+                if uploaded_image is None:
+                    history[-1] = [prompt, "Please upload an image for Image Editing."]
+                    yield history, gr.update(value="")
+                    return
 
-            # Thinking parameters in a single row
-            edit_thinking_params = gr.Group(visible=False)
-            with edit_thinking_params:
-                with gr.Row():
-                    edit_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
-                    edit_max_think_token_n = gr.Slider(minimum=64, maximum=4006, value=1024, step=64, interactive=True,
-                                                       label="Max Think Tokens", info="Maximum number of tokens for thinking")
-                    edit_text_temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.3, step=0.1, interactive=True,
-                                                      label="Temperature", info="Controls randomness in text generation")
-
-        edit_btn = gr.Button("Submit")
-
-        # Dynamically show/hide thinking process box for editing
-        def update_edit_thinking_visibility(show):
-            return gr.update(visible=show), gr.update(visible=show)
-
-        edit_show_thinking.change(
-            fn=update_edit_thinking_visibility,
-            inputs=[edit_show_thinking],
-            outputs=[edit_thinking_output, edit_thinking_params]
-        )
-
-        edit_btn.click(
-            fn=edit_image,
-            inputs=[
-                edit_image_input, edit_prompt, edit_show_thinking,
-                edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
-                edit_timestep_shift, edit_num_timesteps,
-                edit_cfg_renorm_min, edit_cfg_renorm_type,
-                edit_max_think_token_n, edit_do_sample, edit_text_temperature, edit_seed
-            ],
-            outputs=[edit_image_output, edit_thinking_output]
-        )
+                # Add Edit specific parameters, including max_think_token_n and seed
+                edit_params = {
+                    **common_infer_params,
+                    "max_think_token_n": common_max_think_token_n_val,
+                    "seed": seed_global_val,
+                    "cfg_text_scale": edit_cfg_text_scale_val,
+                    "cfg_img_scale": edit_cfg_img_scale_val,
+                    "cfg_interval": edit_cfg_interval_val,
+                    "timestep_shift": edit_timestep_shift_val,
+                    "num_timesteps": edit_num_timesteps_val,
+                    "cfg_renorm_min": edit_cfg_renorm_min_val,
+                    "cfg_renorm_type": edit_cfg_renorm_type_val,
+                }
+
+                for img, txt in edit_image(
+                    image=uploaded_image,
+                    prompt=prompt,
+                    **edit_params
+                ):
+                    # For Image Edit, yield image first, then thinking text (if available)
+                    if img is not None:
+                        history[-1] = [prompt, (img, txt)]
+                    elif txt: # Only update text if image is not ready yet
+                        history[-1] = [prompt, txt]
+                    yield history, gr.update(value="")
+
+        except Exception as e:
+            history[-1] = [prompt, f"An error occurred: {e}"]
+            yield history, gr.update(value="") # Update history with error and clear input
+
+    # Event handlers for dynamic UI updates and submission
+    # Control visibility of thinking parameters
+    show_thinking_global.change(
+        fn=lambda x: gr.update(visible=x),
+        inputs=[show_thinking_global],
+        outputs=[thinking_params_container]
+    )
 
-    with gr.Tab("🖼️ Image Understanding"):
-        with gr.Row():
-            with gr.Column(scale=1):
-                img_input = gr.Image(label="Input Image", value=load_example_image('test_images/meme.jpg'))
-                understand_prompt = gr.Textbox(
-                    label="Prompt",
-                    value="Can someone explain what's funny about this meme??"
-                )
-
-            with gr.Column(scale=1):
-                txt_output = gr.Textbox(label="Result", lines=20)
-
-        with gr.Row():
-            understand_show_thinking = gr.Checkbox(label="Thinking", value=False)
-
-        # Add hyperparameter controls in an accordion
-        with gr.Accordion("Inference Hyperparameters", open=False):
-            with gr.Row():
-                understand_do_sample = gr.Checkbox(label="Sampling", value=False, info="Enable sampling for text generation")
-                understand_text_temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.05, interactive=True,
-                                                        label="Temperature", info="Controls randomness in text generation (0=deterministic, 1=creative)")
-                understand_max_new_tokens = gr.Slider(minimum=64, maximum=4096, value=512, step=64, interactive=True,
-                                                      label="Max New Tokens", info="Maximum length of generated text, including potential thinking")
-
-        img_understand_btn = gr.Button("Submit")
-
-        img_understand_btn.click(
-            fn=image_understanding,
-            inputs=[
-                img_input, understand_prompt, understand_show_thinking,
-                understand_do_sample, understand_text_temperature, understand_max_new_tokens
-            ],
-            outputs=txt_output
-        )
+    # Clear image input if mode switches to Text to Image
+    mode_selector.change(
+        fn=lambda mode: gr.update(value=None) if mode == "Text to Image" else gr.update(),
+        inputs=[mode_selector],
+        outputs=[image_input]
+    )
+
+    # List of all input components whose values are passed to process_chat_message
+    inputs_list = [
+        chatbot, user_prompt, image_input, mode_selector,
+        show_thinking_global, seed_global,
+        common_do_sample, common_text_temperature, common_max_think_token_n,
+        t2i_cfg_text_scale, t2i_cfg_interval, t2i_timestep_shift,
+        t2i_num_timesteps, t2i_cfg_renorm_min, t2i_cfg_renorm_type,
+        t2i_image_ratio,
+        edit_cfg_text_scale, edit_cfg_img_scale, edit_cfg_interval,
+        edit_timestep_shift, edit_num_timesteps, edit_cfg_renorm_min,
+        edit_cfg_renorm_type
+    ]
+
+    # Link submit button and text input 'Enter' key to the processing function
+    submit_btn.click(
+        fn=process_chat_message,
+        inputs=inputs_list,
+        outputs=[chatbot, user_prompt],
+        scroll_to_output=True,
+        queue=False, # Set to True if long generation times cause issues, but might affect responsiveness
+    )
+    user_prompt.submit( # Allows pressing Enter in textbox to submit
+        fn=process_chat_message,
+        inputs=inputs_list,
+        outputs=[chatbot, user_prompt],
+        scroll_to_output=True,
+        queue=False,
+    )
 
 demo.launch()
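
For reference, a minimal, self-contained sketch of the streaming pattern the new process_chat_message relies on: a Gradio generator handler that yields successive (chatbot, textbox) updates so partial results appear in the chat as they arrive. The names stream_reply, chat, and box below are illustrative, not part of app.py:

import gradio as gr

def stream_reply(history, message):
    # Append the user turn with an empty bot slot, then fill it incrementally.
    history = history + [[message, ""]]
    for token in ["partial ", "replies ", "stream ", "into ", "the ", "chat"]:
        history[-1][1] += token
        # Each yield pushes the updated history to the Chatbot and clears the Textbox.
        yield history, ""

with gr.Blocks() as sketch:
    chat = gr.Chatbot()
    box = gr.Textbox()
    box.submit(fn=stream_reply, inputs=[chat, box], outputs=[chat, box])

sketch.launch()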