# Source: Hugging Face Space "ILLUME-MLLM/ILLUME_plus-3b"
# (Space status when captured: Sleeping; file size: 37,175 bytes)

import argparse
import os
import traceback
import logging
from functools import partial
from threading import Thread

import re  # Added for parsing image tokens

import torch

from transformers import TextIteratorStreamer

from transformers import AutoModel, AutoProcessor
from PIL import Image

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
logging.getLogger("http").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)

import gradio as gr
import spaces

from conversation import default_conversation, conv_templates, SeparatorStyle

# --- Global Variables and Model Loading ---
# These three start as None; nothing in this section assigns them, so they are
# presumably filled in by start-up code elsewhere in the file.
model = None  # Global variable to hold the loaded ILLUME model
args = None  # Global variable to hold command line args
streamer = None  # Global text streamer handed to stream_response() by http_chat_bot()

# Placeholder that marks where an image sits inside a prompt / message text.
DEFAULT_IMAGE_TOKEN = '<image>'

# Define common resolutions
# Offered in the resolution dropdown (rendered as "AxB" strings) when the
# diffusion decoder is disabled; see update_resolution_dropdown().
DEFAULT_RESOLUTIONS = [
    (256, 256), (512, 512), (384, 640), (640, 384), (512, 384),
    (384, 512), (256, 384), (384, 256), (256, 512), (512, 256)
]

# Offered in the resolution dropdown when the diffusion decoder is enabled.
# Each entry is exactly twice the entry at the same position above.
DEFAULT_DIFFUSION_RESOLUTIONS = [
    (512, 512), (1024, 1024), (768, 1280), (1280, 768), (1024, 768),
    (768, 1024), (512, 768), (768, 512), (512, 1024), (1024, 512)
]

# Key looked up in conv_templates to build a new conversation.
conv_templates_version = 'qwen2'


def check_image_token_num(image_embed_inds, token_nums=(81, 256), identifier=""):
    """Force every level of image token indices to its expected length.

    Levels that are too long are truncated, empty levels are filled with
    zeros, and levels that are too short are padded by repeating their last
    index. The caller's sequences are never modified.

    Args:
        image_embed_inds: One sequence of token indices per level.
        token_nums: Expected number of indices for each level, in level order.
        identifier: Prefix added to every log message.

    Returns:
        A list with one entry per level, or ``None`` when the number of levels
        differs from ``len(token_nums)``. A level that already has the
        expected length is returned as the same object that was passed in.
    """
    if len(image_embed_inds) != len(token_nums):
        logging.error(
            f"{identifier} Mismatch between number of image token levels ({len(image_embed_inds)}) and expected token_nums ({len(token_nums)})")
        return None  # Indicate error

    image_embed_inds_out = []
    for level, (embed_inds, token_num) in enumerate(zip(image_embed_inds, token_nums)):
        if len(embed_inds) != token_num:
            logging.warning(
                f"{identifier} Level {level} embed_inds length {len(embed_inds)} not equal to expected {token_num}! Padding/truncating.")
            if len(embed_inds) > token_num:
                embed_inds = embed_inds[:token_num]
            elif len(embed_inds) == 0:
                logging.warning(f"{identifier} Level {level} embed_inds is empty. Filling with zeros.")
                embed_inds = [0] * token_num
            else:
                # Pad with the last token ID. Build a new list instead of
                # extending in place so the caller's data stays untouched.
                missing = token_num - len(embed_inds)
                embed_inds = list(embed_inds) + [embed_inds[-1]] * missing
        image_embed_inds_out.append(embed_inds)
    return image_embed_inds_out


def pad_sequence(tokenizer, input_ids, batch_first, padding_value):
    """Pad a list of 1-D tensors to a common length on the tokenizer's side.

    ``torch.nn.utils.rnn.pad_sequence`` is called without a padding side, so
    it pads at the end. For left padding each sequence is reversed, padded,
    and the result is reversed back along the sequence dimension.

    Args:
        tokenizer: Object whose ``padding_side`` attribute is compared to
            ``"left"``; any other value pads on the right.
        input_ids: List of tensors to pad.
        batch_first: Forwarded to ``torch.nn.utils.rnn.pad_sequence``; selects
            whether the batch is the first or second dimension of the result.
        padding_value: Value written into the padded positions.

    Returns:
        The padded tensor.
    """
    pad_left = tokenizer.padding_side == "left"
    if pad_left:
        input_ids = [torch.flip(ids, [0]) for ids in input_ids]
    padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value)
    if pad_left:
        # The sequence axis is dim 1 only when batch_first is set; flipping
        # dim 1 otherwise would reverse the batch order instead.
        seq_dim = 1 if batch_first else 0
        padded = torch.flip(padded, [seq_dim])
    return padded


# --- Gradio UI Functions ---
# Button values that the handlers below return in their two trailing output
# slots. Going by the names and constructor arguments: unchanged, clickable,
# and not clickable.
no_change_btn = gr.Button()
enable_btn = gr.Button(interactive=True)
disable_btn = gr.Button(interactive=False)
# Error texts written into the assistant message. Both start with "**";
# add_text() relies on that prefix to start a fresh conversation after an error.
server_error_msg = "**NETWORK ERROR OR SERVER ISSUE. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
server_oom_msg = "**OUT OF GPU MEMORY DETECTED. PLEASE DECREASE THE MAX OUTPUT TOKENS OR IMAGE RESOLUTION AND REGENERATE.**"


def load_demo_refresh_model_list():
    """Return a fresh copy of the conversation template for a new session.

    The template named by ``conv_templates_version`` is used when it exists;
    otherwise the first template found in ``conv_templates`` is used and a
    warning is logged.
    """
    logging.info("load_demo.")

    if conv_templates_version not in conv_templates:
        logging.warning(f"Conversation template '{conv_templates_version}' not found. Using default.")
        # Fall back to whichever template comes first in the registry.
        fallback_name = next(iter(conv_templates))
        return conv_templates[fallback_name].copy()

    fresh_state = conv_templates[conv_templates_version].copy()
    logging.info(f"Using conversation template: {conv_templates_version}")
    return fresh_state


def regenerate(state):
    """Blank the last assistant message so the bot can answer the turn again.

    When the conversation holds fewer than two messages nothing is changed.

    Returns:
        ``(state, chatbot, "", None, disable_btn, disable_btn)``: the state,
        its chatbot rendering, an empty textbox value, an empty image value
        and two disabled buttons.
    """
    logging.info("regenerate.")

    has_full_turn = bool(state.messages) and len(state.messages) >= 2
    if has_full_turn:
        # Drop the previous answer and make sure the bot step is not skipped.
        state.messages[-1][-1] = None
        state.skip_next = False

    return (state, state.to_gradio_chatbot(), "", None, disable_btn, disable_btn)


def http_bot_conditional_then(state, temperature, top_k, top_p,
                              image_gen_temperature, image_gen_top_k, image_gen_top_p, max_output_tokens,
                              llm_cfg_scale, resolution_wh, use_diffusion, diffusion_cfg_scale,
                              diffusion_num_inference_steps):
    """Route the request to the chat bot or to the generate/edit bot.

    ``state.mode == 'chat'`` selects ``http_chat_bot``; any other mode selects
    ``http_gen_edit_bot``. Everything the chosen generator yields is passed
    through, and its return value becomes this generator's return value.
    """
    if state.mode == 'chat':
        bot = http_chat_bot(state, temperature, top_k, top_p, max_output_tokens)
    else:
        bot = http_gen_edit_bot(
            state, temperature, top_k, top_p,
            image_gen_temperature, image_gen_top_k, image_gen_top_p,
            max_output_tokens, llm_cfg_scale, resolution_wh,
            use_diffusion, diffusion_cfg_scale, diffusion_num_inference_steps)
    return (yield from bot)


def clear_history():
    """Start over with a fresh conversation and empty inputs.

    Returns:
        ``(state, chatbot, "", None, disable_btn, disable_btn)`` built from a
        newly created conversation state.
    """
    logging.info("clear_history.")
    fresh_state = load_demo_refresh_model_list()
    chatbot_view = fresh_state.to_gradio_chatbot()
    return (fresh_state, chatbot_view, "", None, disable_btn, disable_btn)


def add_text(state, text, image, mode):
    """Append the user's turn (text and optional image) to the conversation.

    The conversation is restarted before appending when the previous
    assistant message is an error text (starts with ``**``), when ``mode`` is
    ``'image-generation'``, or when a new image arrives while the conversation
    already contains images.

    Args:
        state: Current conversation state.
        text: Text typed by the user.
        image: Uploaded image, or ``None``.
        mode: Stored on ``state.mode`` for the bot step that follows.

    Returns:
        ``(state, chatbot, "", image_value, btn, btn)``. With no text and no
        image, the state is flagged ``skip_next`` and the buttons are left
        unchanged; otherwise the image value is ``None`` and both buttons are
        disabled.
    """
    has_image = image is not None
    logging.info(f"add_text. Text len: {len(text)}, Image provided: {has_image}")

    # Nothing to send: tell the bot step to skip this round.
    if not text.strip() and not has_image:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", image, no_change_btn, no_change_btn)

    last_reply = state.messages[-1][1] if state.messages else None
    if isinstance(last_reply, str) and last_reply.startswith("**"):
        state = load_demo_refresh_model_list()  # Start fresh after error

    if mode == 'image-generation':
        state = load_demo_refresh_model_list()

    user_message = text
    if has_image:
        # A new image starts a new conversation when images are already present.
        if state.get_images():
            state = load_demo_refresh_model_list()

        if '<image>' not in user_message:
            user_message = f'<image>\n{user_message}'
        user_message = (user_message, image, "Default")

    user_role, assistant_role = state.roles[0], state.roles[1]
    state.append_message(user_role, user_message)
    state.append_message(assistant_role, None)  # Placeholder for assistant
    state.skip_next = False
    state.mode = mode
    logging.info(f"Updated state messages: {len(state.messages)}")

    return (state, state.to_gradio_chatbot(), "", None, disable_btn, disable_btn)


def stream_response(model, inputs, streamer, prompt, gen_kwargs):
    """Run ``model.generate`` in a background thread and yield the growing text.

    Args:
        model: Object exposing ``generate(streamer=..., **kwargs)``.
        inputs: Mapping of model inputs, expanded into the generate call.
        streamer: Iterable of text chunks that ``generate`` feeds; it must
            also expose ``end()``, which is used to stop the iteration when
            generation fails.
        prompt: Text each yielded value starts with.
        gen_kwargs: Extra keyword arguments for ``generate``.

    Yields:
        ``prompt`` followed by all text received so far, once per chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the streamer has been drained.
    """
    call_kwargs = dict(
        streamer=streamer,
        **inputs,
        **gen_kwargs
    )
    errors = []

    def _generate():
        try:
            model.generate(**call_kwargs)
        except Exception as exc:
            # Without this the failure stays inside the thread and the
            # consumer loop below keeps waiting on the streamer forever.
            errors.append(exc)
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    generated_text = prompt

    for new_text in streamer:
        generated_text += new_text
        yield generated_text

    thread.join()
    if errors:
        raise errors[0]


@spaces.GPU
def http_chat_bot(state, temperature, top_k, top_p, max_new_tokens):
    """Stream the assistant's text reply for the latest user turn.

    Reads the module-level ``model``, ``processor`` and ``streamer``.

    Args:
        state: Conversation state; its last message is the assistant
            placeholder that gets filled in.
        temperature: Sampling temperature; sampling is enabled only when it is
            greater than 0.
        top_k: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        max_new_tokens: Forwarded to ``model.generate``; values below 1 end
            the turn with an explanatory message instead of generating.

    Yields:
        ``(state, chatbot, btn, btn)`` tuples, one per UI update.

    Returns:
        The final ``(state, chatbot, enable_btn, enable_btn)`` tuple (visible
        to callers that use ``yield from``).
    """
    logging.info("http_chat_bot.")

    if state.skip_next:
        logging.warning("Skipping bot generation. skip_next or model not ready.")
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2
        return

    if len(state.messages) < 2:
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2
        return

    # --- Prepare Inputs for ILLUME ---
    # Get the full prompt from the conversation state
    prompt = state.get_prompt()
    all_images = state.get_images(return_pil=True)

    logging.info(f"Raw Prompt: {prompt}")

    inputs = dict(
        text=prompt,
    )
    # Tokenize the prompt
    # run processors
    inputs = processor(**inputs, return_tensors="pt")
    inputs = inputs.to(model.device)

    # avoid mismatch resolution. process the images alone
    # (each image is processed on its own, so pixel_values is a list with one
    # tensor per image rather than a single stacked tensor)
    if len(all_images):
        images = []
        for image in all_images:
            images.append(processor.image_processor(image, return_tensors="pt")['pixel_values'].to(model.device))
        pixel_values = images
        inputs['pixel_values'] = pixel_values

    logging.info(f"Input IDs shape: {inputs.input_ids.shape}")

    # Set initial response placeholder
    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 2

    # --- MLLM Generation ---
    gen_kwargs = dict(
        pad_token_id=processor.tokenizer.pad_token_id,
        do_sample=True if temperature > 0 else False,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        use_cache=True,
        eos_token_id=processor.tokenizer.eos_token_id  # Ensure EOS token is set
    )
    logging.info(f"==== request kwargs====\n{gen_kwargs}")

    if max_new_tokens < 1:
        state.messages[-1][-1] = "Exceeds max token length. Please start a new conversation, thanks."
        yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 2
        return

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 2

    # Stream output
    try:
        for generated_text in stream_response(model, inputs, streamer, prompt, gen_kwargs):
            # stream_response yields the prompt plus the reply so far; cut the
            # prompt off before showing it.
            output = generated_text[len(prompt):].strip()
            state.messages[-1][-1] = output
            yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2
    except Exception:
        os.system("nvidia-smi")
        # format_exc() returns the traceback text; print_exc() returns None,
        # which is what used to end up in the log.
        logging.error(f"Error during chat generation:\n{traceback.format_exc()}")
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2
    return (state, state.to_gradio_chatbot()) + (enable_btn,) * 2



@torch.inference_mode()
@spaces.GPU(duration=120)  # Specify a duration to avoid timeout
def http_gen_edit_bot(state, temperature, top_k, top_p, image_gen_temperature,
                      image_gen_top_k, image_gen_top_p, max_output_tokens,
                      llm_cfg_scale, resolution_wh, use_diffusion, diffusion_cfg_scale, diffusion_num_inference_steps):
    """Generate an image, or edit the uploaded one, and report progress to the UI.

    When the conversation contains images the request is treated as image
    editing and the output size follows the processed input image; otherwise
    it is treated as text-to-image generation at the requested resolution.
    Reads the module-level ``model`` and ``processor``.

    Args:
        state: Conversation state; ``messages[-2]`` is the user request and
            ``messages[-1]`` the assistant placeholder.
        temperature: Text sampling temperature; sampling is enabled only when
            it is greater than 0.
        top_k: Text sampling top-k.
        top_p: Text sampling top-p.
        image_gen_temperature: Passed as both the semantic and the pixel image
            token temperature.
        image_gen_top_k: Passed as the semantic top-k; the pixel top-k is
            three times this value.
        image_gen_top_p: Passed as both the semantic and the pixel top-p.
        max_output_tokens: Not used; generation always runs with
            ``max_new_tokens=2048``.
        llm_cfg_scale: Passed to ``model.generate`` as ``guidance_scale``.
        resolution_wh: String of two integers separated by ``x``. Despite the
            name, the first number is used as height and the second as width.
        use_diffusion: Halves the requested size before generation and is
            forwarded to ``processor.decode_images``.
        diffusion_cfg_scale: Forwarded to ``processor.decode_images``.
        diffusion_num_inference_steps: Forwarded to
            ``processor.decode_images``.

    Yields:
        ``(state, chatbot, btn, btn)`` tuples, one per UI update. Failures are
        reported through the assistant message rather than raised.
    """
    logging.info("http_gen_edit_bot.")

    if state.skip_next:
        logging.warning("Skipping bot generation. skip_next or model not ready.")
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2
        return

    if len(state.messages) < 2:
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2
        return

    # --- Prepare Inputs for ILLUME ---
    all_images = state.get_images(return_pil=True)

    # read resolution from user defined.
    h_str, w_str = resolution_wh.split('x')
    h_out, w_out = int(h_str), int(w_str)

    if use_diffusion:
        # The dropdown lists doubled sizes while diffusion is on (see
        # update_resolution_dropdown); presumably the diffusion decoder
        # upsamples 2x, so the model is asked for half the size.
        h_out, w_out = (h_out // 2, w_out // 2)
    ratio_tag = f"<height_{h_out}><width_{w_out}>"

    # Work on a copy so the prompt rewriting below does not show up in the chat.
    input_state = state.copy()

    # prepare the text.
    original_image_sizes = None
    if len(all_images):
        # image editing.
        original_image_sizes = [image.size for image in all_images]
        logging.info(f"original_image_sizes: {original_image_sizes}")

        all_images = [processor.transform_image_nearest_resolution_ratio(image) for image in all_images]

        inputs = dict(
            images=all_images
        )

        image_inputs = processor.image_processor(**inputs, return_tensors="pt")
        image_inputs = image_inputs.to(model.device)

        # overwrite the output resolution
        h, w = image_inputs['pixel_values'].shape[-2:]
        ratio_tag = f"<height_{h}><width_{w}>"
        h_out, w_out = h, w

        unconditional_text = f"{ratio_tag}{DEFAULT_IMAGE_TOKEN}\nReconstruct the image according to the given image\n"  # of {ratio_tag}

        instruction, img, image_process_type = input_state.messages[-2][-1]
        instruction = instruction.replace(DEFAULT_IMAGE_TOKEN, '').strip()
        text = f"{ratio_tag}{DEFAULT_IMAGE_TOKEN}\nPlease edit the image according to the instruction: {instruction}\n"
        input_state.messages[-2][-1] = text, img, image_process_type

    else:
        # image generation
        unconditional_text = f"Generate a random image of {ratio_tag}"

        text = input_state.messages[-2][-1]
        logging.info(f"Current text is {text}")
        text = f"Generate an image of {ratio_tag}, the content of image is {text}"
        input_state.messages[-2][-1] = text
        logging.info(f"After formating. current text is {text}")
        image_inputs = {}

    logging.info(f"Target Resolution: {h_out}x{w_out}, Ratio Tag: {ratio_tag}")
    target_image_resolution = (h_out, w_out)
    prompt = input_state.get_prompt()
    logging.info(f"Raw Prompt: {prompt}")

    # Tokenize the prompt
    inputs = dict(
        text=prompt + ratio_tag,
    )

    inputs = processor(**inputs, return_tensors="pt")
    inputs = inputs.to(model.device)
    inputs.update(image_inputs)

    # Build the unconditional prompt whose token ids are passed below as the
    # negative prompt for guidance.
    conv_uncond = conv_templates[conv_templates_version].copy()
    conv_uncond.append_message(conv_uncond.roles[0], unconditional_text)
    conv_uncond.append_message(conv_uncond.roles[1], None)
    unconditional_prompt_str = conv_uncond.get_prompt()

    uncond_inputs = dict(
        text=unconditional_prompt_str + ratio_tag,
        images=all_images
    )

    uncond_inputs = processor(**uncond_inputs, return_tensors="pt")
    uncond_inputs = uncond_inputs.to(model.device)

    logging.info(f"Input IDs shape: {inputs.input_ids.shape}")

    # Set initial response placeholder
    state.messages[-1][-1] = "image generating..."
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 2

    gen_kwargs = dict(
        max_new_tokens=2048,
        do_sample=True if temperature > 0 else False,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )

    image_gen_kwargs = dict(
        negative_image_prompt_ids=uncond_inputs.input_ids,
        negative_image_prompt_attention_mask=uncond_inputs.attention_mask,
        target_image_resolution=target_image_resolution,
        guidance_scale=llm_cfg_scale,
        image_semantic_temperature=image_gen_temperature,
        image_semantic_top_k=image_gen_top_k,
        image_semantic_top_p=image_gen_top_p,
        image_pixel_temperature=image_gen_temperature,
        image_pixel_top_k=image_gen_top_k * 3,
        image_pixel_top_p=image_gen_top_p,
    )

    # --- MLLM Generation ---
    generated_image = None
    generated_text = ""
    try:
        from transformers import set_seed
        # Fixed seed: the same request and settings reproduce the same output.
        set_seed(42)
        with torch.inference_mode():  # Ensure no gradients are calculated
            output_ids = model.generate(
                **inputs,
                use_cache=True,
                **gen_kwargs,
                **image_gen_kwargs
            )

            # Keep only the newly generated tokens (drop the prompt part).
            output_ids = output_ids[:, inputs['input_ids'].shape[1]:]

        logging.info(f"Generated output IDs shape: {output_ids.shape}")

        # Decode the full output first; the image tokens are parsed out of the text.
        generated_ids = output_ids[0]
        full_output_text = processor.tokenizer.decode(generated_ids, skip_special_tokens=True)
        logging.info(f"Full decoded output: {full_output_text}")

        # --- Parse Output for Image Tokens and Text ---
        generated_text, image_embed_inds_list, list_image_token_parts = processor.parse_text_image(full_output_text,
                                                                                                   DEFAULT_IMAGE_TOKEN)

        # Explicit check instead of assert, which is removed under `python -O`.
        if len(image_embed_inds_list) != 1:
            raise ValueError(
                f"The number of generated image should be 1. Got {len(image_embed_inds_list)}.")
        image_embed_inds = image_embed_inds_list[0]
        logging.info(f"The generated text: {full_output_text}")
        logging.info(f"Parsed generated text (image presents as {DEFAULT_IMAGE_TOKEN}): {generated_text}")

        # Show a progress note while the image is being decoded.
        state.messages[-1][-1] = "vision tokenizer decoding..."
        yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 2

        # --- Image Detokenization ---
        if any(image_embed_inds):
            logging.info("Image tokens found. Attempting detokenization...")
            yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 2

            samples = processor.decode_images(image_embed_inds_list, target_resolution=target_image_resolution,
                                              use_diffusion=use_diffusion, diffusion_cfg_scale=diffusion_cfg_scale,
                                              diffusion_num_inference_steps=diffusion_num_inference_steps)
            generated_image = samples[0]
            if use_diffusion:
                logging.info(
                    f"Using Diffusion Decoder (cfg: {diffusion_cfg_scale}, steps: {diffusion_num_inference_steps}) Image size: {generated_image.size}")
            else:
                logging.info(f"Using VQ Tokenizer Decoder. Image size: {generated_image.size}")

            if generated_image:
                if original_image_sizes is not None and len(
                        original_image_sizes) == 1:  # editing task, unpad and resize image to original size
                    original_size = original_image_sizes[0]
                    logging.info(f"original size: {original_size}. Output Image size: {generated_image.size}")
                    generated_image = processor.unpad_and_resize_back(generated_image, original_size[0], original_size[1])
                    logging.info(f"final image size: {generated_image.size}")

            logging.info("Image successfully generated.")
            # <image> is placeholder.
            state.messages[-1][-1] = (DEFAULT_IMAGE_TOKEN, [generated_image], list_image_token_parts)
        else:
            # No image tokens generated
            state.messages[-1][-1] = generated_text  # Final text without image

        # Final yield with potentially updated message (text + image)
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2

    except torch.cuda.OutOfMemoryError as e:
        logging.error(f"CUDA OutOfMemoryError during generation: {e}\n{traceback.format_exc()}")
        state.messages[-1][-1] = server_oom_msg
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2
    except Exception as e:
        logging.error(f"Error during model generation or detokenization: {e}\n{traceback.format_exc()}")
        state.messages[-1][-1] = f"{server_error_msg}\n```\n{traceback.format_exc()}\n```"  # Show traceback in error
        yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 2

    logging.info(f"Final Assistant Message Length: {len(state.messages[-1][-1])}")


def update_resolution_dropdown(diffusion_enabled, current_resolution_str):
    """Rebuild the resolution dropdown after the diffusion switch changes.

    With diffusion enabled the current selection is doubled and the choices
    come from ``DEFAULT_DIFFUSION_RESOLUTIONS``; with it disabled the
    selection is halved and the choices come from ``DEFAULT_RESOLUTIONS``.
    A selection that is not in the new list is replaced by the list's first
    entry.

    Args:
        diffusion_enabled: Truthy when the diffusion decoder is selected.
        current_resolution_str: Current dropdown value, two integers joined
            by ``x``.

    Returns:
        A ``gr.Dropdown`` carrying the new choices and selected value.
    """
    logging.info(f"Updating resolution dropdown. Diffusion: {diffusion_enabled}, Current: {current_resolution_str}")
    first_text, second_text = current_resolution_str.split('x')
    cur_h, cur_w = int(first_text), int(second_text)

    if diffusion_enabled:
        allowed = DEFAULT_DIFFUSION_RESOLUTIONS
        candidate = (cur_h * 2, cur_w * 2)
    else:
        allowed = DEFAULT_RESOLUTIONS
        candidate = (cur_h // 2, cur_w // 2)

    if candidate not in allowed:
        candidate = allowed[0]
    new_h, new_w = candidate

    return gr.Dropdown(
        choices=[f'{h}x{w}' for h, w in allowed],
        value=f"{new_h}x{new_w}",
        label="Output Resolution (HxW)",
        elem_id="resolution_dropdown",
        info="Select target size for generated images."
    )


# --- Gradio Layout ---
# HTML banner with the project links and usage steps; build_demo() renders it
# with gr.HTML unless embed mode is on.
title_markdown = """
<div style="display: flex; align-items: center; padding: 20px; border-radius: 10px; background-color: #f0f0f0;">
  <div>
    <h1 style="margin: 0;"> ILLUME+: Illuminating Unified MLLM with Dual Visual Tokenization and Diffusion Refinement</h1>
    <h2 style="margin: 10px 0;">
      <a href="https://arxiv.org/abs/2504.01934" target="_blank" rel="noopener noreferrer">Paper</a> |
      <a href="https://github.com/illume-unified-mllm/ILLUME_plus" target="_blank" rel="noopener noreferrer">Code</a> |
      <a href="https://huggingface.co/ILLUME-MLLM/illume_plus-qwen2_5-3b-hf" target="_blank" rel="noopener noreferrer">Model</a> |
      <a href="https://illume-unified-mllm.github.io/" target="_blank" rel="noopener noreferrer">Project Page</a>
    </h2>
    <ul style="margin: 20px 0; padding-left: 20px;">
      <li><strong>1.</strong> Enter text and/or upload an image.</li>
      <li><strong>2.</strong> Click the 💬 <strong>Chat</strong> button for image inputted conversations</li>
      <li><strong>3.</strong> Click the 🖼️ <strong>Generate</strong> for image generation and image editing.</li>
      <li><strong>4.</strong> (Optional) Enable Diffusion Decoder for image super resolution decoding.</li>
      <li><strong>5.</strong> Adjust generation parameters if needed.
        <br/><strong>💡 Tip 1:</strong> For better image generation quality, we recommend setting <code>temperature = 1.0</code>, <code>top_k = 2048</code>, <code>top_p = 1.0</code>, <code>llm_cfg = 2.0</code>.
        <br/><strong>💡 Tip 2:</strong> For better image editing quality, we recommend setting <code>temperature = 0.7</code>, <code>top_k = 512</code>, <code>top_p = 0.8</code>, <code>llm_cfg = 1.5</code>.
        <br/><strong>💡 Tip 3:</strong> For diffusion decoder, CFG scale of 1.5 or 2.0 is enough.
      </li>
    </ul>
  </div>
</div>
"""

# Markdown citation section; the BibTeX entry is indented four spaces so that
# Markdown shows it as a code block.
learn_more_markdown = ("""
## Citation


    @article{huang2025illume_plus,
      title={ILLUME+: Illuminating Unified MLLM with Dual Visual Tokenization and Diffusion Refinement},
      author={Huang, Runhui and Wang, Chunwei and Yang, Junwei and Lu, Guansong and Yuan, Yunlong and Han, Jianhua and Hou, Lu and Zhang, Wei and Hong, Lanqing and Zhao, Hengshuang and Xu, Hang},
      journal={arXiv preprint arXiv:2504.01934},
      year={2025}
    }
""")

# Custom stylesheet passed to gr.Blocks(css=...) in build_demo(). It sets a
# minimum button width, limits the size of images shown in chat messages,
# removes avatar padding and gives the resolution dropdown a minimum width.
block_css = """
#buttons button {
    min-width: min(120px,100%);
}
.message-row img {
    max-width: 80% !important;
    max-height: 400px !important;
    height: auto !important;
    width: auto !important;
    display: block !important;
    margin-top: 10px !important;
    margin-bottom: 5px !important;
    border-radius: 5px !important;
    border: 1px solid #e0e0e0 !important;
    object-fit: contain !important;
    aspect-ratio: auto !important;
}
.avatar-container img {
    padding: 0px !important;
}
/* Style for resolution dropdown */
#resolution_dropdown .gradio-dropdown {
    min-width: 150px !important;
}
"""

def build_demo(embed_mode):
    """Build the Gradio Blocks UI for the ILLUME demo and wire its events.

    The page has a left column (input image plus text-generation and
    image-generation parameter controls), a right column (chatbot, prompt
    textbox and action buttons) and three example galleries underneath.

    Args:
        embed_mode: When truthy, the title HTML at the top and the citation
            markdown at the bottom are not rendered.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet queued or launched).
    """
    # Created before the Blocks context so it can be placed later, inside the
    # right-hand column, with textbox.render().
    textbox = gr.Textbox(label="Text Input / Prompt", show_label=False,
                         placeholder="Enter text prompt. Ask about the image or request image generation...",
                         container=False, scale=8)

    with gr.Blocks(title="ILLUME Demo", theme=gr.themes.Default(), css=block_css) as demo:
        conversation_state = gr.State()  # Holds conversation state (instance of illume.conversation.Conversation)

        if not embed_mode:
            gr.HTML(title_markdown)

        with gr.Row():
            # Left column: input image and sampling controls.
            with gr.Column(scale=2):
                imagebox = gr.Image(type="pil", label="Input Image", height=300)

                # Text Generation Parameters
                with gr.Accordion("Text Generation Parameters", open=True):
                    temperature = gr.Slider(
                        minimum=0.0, maximum=1.5, value=1.0, step=0.1,
                        label="Temperature",
                        info="Controls randomness of the output (higher = more diverse)."
                    )
                    top_k = gr.Slider(
                        minimum=1, maximum=4096, value=128, step=1,
                        label="Top-K",
                    )
                    top_p = gr.Slider(
                        minimum=0.1, maximum=1.0, value=1.0, step=0.05,
                        label="Top-P",
                    )
                    max_output_tokens = gr.Slider(
                        minimum=128, maximum=8192, value=1024, step=128,
                        label="Max Output Tokens",
                    )

                # Image Generation Parameters
                with gr.Accordion("Image Generation Parameters", open=True):
                    image_gen_temperature = gr.Slider(
                        minimum=0.0, maximum=1.5, value=1.0, step=0.1,
                        label="Temperature",
                    )
                    image_gen_top_k = gr.Slider(
                        minimum=1, maximum=4096 * 2, value=2048, step=32,
                        label="Top-K",
                        info="Recommended value for better image generation: 2048."
                    )
                    image_gen_top_p = gr.Slider(
                        minimum=0.1, maximum=1.0, value=1.0, step=0.05,
                        label="Top-P",
                    )

                    # NOTE(review): the variable name says "wh" while the label and
                    # the tuple unpacking below say HxW - confirm which order the
                    # generation handler expects when it parses this value.
                    resolution_wh_dropdown = gr.Dropdown(
                        choices=[f'{h}x{w}' for h, w in DEFAULT_RESOLUTIONS],
                        value="512x512",
                        label="Output Resolution (HxW)",
                        elem_id="resolution_dropdown",
                        info="Select target size for generated images."
                    )

                    llm_cfg_scale = gr.Slider(
                        minimum=1.0, maximum=10.0, value=2.0, step=0.1,
                        label="LLM CFG Scale",
                        info="Guidance for text-to-image conditioning (higher = stricter to prompt)."
                    )

                    with gr.Accordion("Diffusion Decoder (Optional)", open=False):
                        use_diffusion_checkbox = gr.Checkbox(
                            value=False, interactive=True,
                            label="Use diffusion decoder for image generation",
                            info="Enable diffusion decoder."
                        )
                        diffusion_cfg_scale = gr.Slider(
                            minimum=1.0, maximum=15.0, value=2.0, step=0.1,
                            label="Diffusion CFG Scale",
                            info="Guidance strength for diffusion decoder."
                        )
                        diffusion_num_inference_steps = gr.Slider(
                            minimum=5, maximum=100, value=20, step=5,
                            label="Diffusion Inference Steps",
                            info="Number of steps during denoising."
                        )

            # Right column: chat history, prompt box and action buttons.
            with gr.Column(scale=8):
                chatbot = gr.Chatbot(
                    elem_id="chatbot",
                    label="ILLUME Chat",
                    layout="bubble",
                    height=650,  # Increased height
                    bubble_full_width=False,
                    render_markdown=True  # Crucial for images
                )
                with gr.Row():
                    # Place the textbox that was constructed before the Blocks context.
                    textbox.render()
                with gr.Row(elem_id="buttons") as button_row:
                    chat_btn = gr.Button(value="💬 Chat", variant="primary")
                    gen_btn = gr.Button(value="🖼️ Generate", variant="secondary")
                with gr.Row(elem_id="additional-buttons") as button_row_additional:
                    # Both start disabled (interactive=False); they are listed in
                    # btn_list so the handlers below can update them.
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️ Clear History", interactive=False)

        # Update examples for ILLUME
        # Each example row fills (imagebox, textbox); generation examples use no image.
        with gr.Accordion("Examples (Click to Load)", open=True):
            with gr.Row():
                gr.Examples(examples=[
                    ["examples/ImageUnderstandingExample/images/1.png",
                     "What are they doing?"],
                    ["examples/ImageUnderstandingExample/images/2.png",
                     "Depict the image in detail."],
                    ["examples/ImageUnderstandingExample/images/3.png",
                     "parse the table"],
                ], inputs=[imagebox, textbox], label='Image Understanding Examples')

                gr.Examples(examples=[
                    [None, "a cat with a hat."],
                    [None, "a smiling child."],
                    [None, "tiger cub playing with soccer ball"],
                    [None, "screenshot from a 16 bit platform game in a lush green landscape"],
                    [None, "Old car in kandy sri lanka,lake road,flower, bright, sunny, orange sky"],
                    [None, "Create a vibrant painting of a tropical beach at sunset."],
                ], inputs=[imagebox, textbox], label='Image Generation Examples')

                gr.Examples(examples=[
                    ["examples/EditingSingleTurnExample/images/0.jpg",
                     "Change the color of the boots to a deep forest green"],
                    ["examples/EditingSingleTurnExample/images/1.jpg",
                     "Add a hat on the dog"],
                    ["examples/EditingSingleTurnExample/images/2.jpg",
                     "Remove the dried flowers"],
                    ["examples/EditingSingleTurnExample/images/3.jpg",
                     "Change it into winter"],
                    ["examples/EditingSingleTurnExample/images/4.jpg",
                     "Delete the tennis racket from the man's hand"],
                    ["examples/EditingSingleTurnExample/images/5.jpg",
                     "Show me this as it would appear in a comic book"],
                ], inputs=[imagebox, textbox], label='Image Editing Examples')

        if not embed_mode:
            gr.Markdown(learn_more_markdown)

        # Register listeners
        # btn_list is appended to the outputs of every handler below.
        btn_list = [regenerate_btn, clear_btn]
        # Chat only needs the text sampling parameters.
        parameter_chat_inputs = [temperature, top_k, top_p, max_output_tokens]
        # Generation/editing receives the text sampling parameters, the image
        # sampling parameters, the CFG scale, the resolution and the diffusion
        # decoder settings, in this positional order.
        parameter_gen_edit_inputs = [temperature, top_k, top_p,
                                     image_gen_temperature, image_gen_top_k, image_gen_top_p, max_output_tokens,
                                     llm_cfg_scale, resolution_wh_dropdown,
                                     use_diffusion_checkbox, diffusion_cfg_scale, diffusion_num_inference_steps]

        # Regenerate: first run `regenerate` on the state, then hand over to
        # http_bot_conditional_then with the full generation parameter set.
        regenerate_btn.click(
            regenerate,
            [conversation_state],
            [conversation_state, chatbot, textbox, imagebox] + btn_list
        ).then(
            http_bot_conditional_then,
            [conversation_state] + parameter_gen_edit_inputs,  # Pass state and all params
            [conversation_state, chatbot] + btn_list,
        )

        # Clear: takes no inputs and writes state, chat, textbox, image and buttons.
        clear_btn.click(
            clear_history,
            None,
            [conversation_state, chatbot, textbox, imagebox] + btn_list,
            queue=False
        )

        # Default use chat.
        # Submitting the textbox is wired to the same handlers as the Chat button.
        textbox.submit(
            partial(add_text, mode="chat"),
            [conversation_state, textbox, imagebox],
            [conversation_state, chatbot, textbox, imagebox] + btn_list,
            queue=False
        ).then(
            http_chat_bot,
            [conversation_state] + parameter_chat_inputs,
            [conversation_state, chatbot] + btn_list,
        )

        # Regular Vision-language Chat
        chat_btn.click(partial(add_text, mode="chat"),
                       [conversation_state, textbox, imagebox],
                       [conversation_state, chatbot, textbox, imagebox] + btn_list,
                       queue=False
                       ).then(
            http_chat_bot,
            [conversation_state] + parameter_chat_inputs,
            [conversation_state, chatbot] + btn_list,
        )

        # Image Generation
        # Unlike the chat handlers, this add_text step does not pass queue=False.
        gen_btn.click(
            partial(add_text, mode="image-generation"),
            [conversation_state, textbox, imagebox],
            [conversation_state, chatbot, textbox, imagebox] + btn_list
        ).then(
            http_gen_edit_bot,
            [conversation_state] + parameter_gen_edit_inputs,
            [conversation_state, chatbot] + btn_list
        )

        # Toggling the diffusion checkbox rewrites the resolution dropdown.
        # Presumably this switches between DEFAULT_RESOLUTIONS and
        # DEFAULT_DIFFUSION_RESOLUTIONS - confirm in update_resolution_dropdown.
        use_diffusion_checkbox.change(
            fn=update_resolution_dropdown,
            inputs=[use_diffusion_checkbox, resolution_wh_dropdown],
            outputs=resolution_wh_dropdown,
            queue=False
        )

        # Load initial state when demo starts
        # The return value of load_demo_refresh_model_list is stored in conversation_state.
        demo.load(
            load_demo_refresh_model_list,
            None,
            conversation_state,
            queue=False
        )
    return demo


# --- Main Execution Block ---
if __name__ == "__main__":
    # Script entry point: parse CLI options, load the ILLUME model, its
    # processor, the vision tokenizer and the diffusion decoder, then build
    # and launch the Gradio demo. `args`, `model`, `processor` and `streamer`
    # are assigned at module level on purpose, matching the module-level
    # globals declared at the top of the file.
    parser = argparse.ArgumentParser()
    # --- Add arguments for ILLUME configs and checkpoints ---
    parser.add_argument("--model_name", type=str, default="ILLUME-MLLM/illume_plus-qwen2_5-3b-hf",
                        help="Name for builder.")
    # The model used to be loaded in bfloat16 regardless of this flag. The
    # default is therefore 'bf16' so that a launch without flags behaves as
    # before, while an explicit --torch_dtype is now actually honoured.
    parser.add_argument("--torch_dtype", type=str, default='bf16', choices=['fp32', 'bf16', 'fp16'],
                        help="Computation data type.")

    parser.add_argument("--diffusion_decoder_path", type=str, default='ILLUME-MLLM/dualvitok-sdxl-decoder',
                        help="Path to Diffusion Decoder checkpoint. Required if using diffusion.")

    # This value is passed to AutoModel.from_pretrained, so it is a checkpoint
    # path or Hub id rather than a config file.
    parser.add_argument("--tokenizer_path", type=str, default='ILLUME-MLLM/dualvitok',
                        help="Path or Hub id of the vision tokenizer checkpoint.")

    # --- End ILLUME arguments ---
    parser.add_argument("--share", action="store_true", help="Create a public Gradio share link")
    parser.add_argument("--embed", action="store_true", help="Run in embed mode (minimal UI)")
    parser.add_argument("--device", type=str, default="cuda", help="Device to run on (cuda, cpu).")

    args = parser.parse_args()

    # --- Model Loading ---
    # Set device
    device = args.device
    logging.info(f"Using device: {device}")

    # Replace the CLI string with the matching torch dtype object.
    args.torch_dtype = dict(fp16=torch.float16, fp32=torch.float32, bf16=torch.bfloat16)[args.torch_dtype]

    # Build the ILLUME model instance
    logging.info("Building ILLUME model...")
    # prepare models and processors
    model = AutoModel.from_pretrained(
        args.model_name,
        torch_dtype=args.torch_dtype,
        attn_implementation='sdpa',  # OR 'flash_attention_2' where available
        low_cpu_mem_usage=True,
        trust_remote_code=True).eval().to(args.torch_dtype).to(device)
    processor = AutoProcessor.from_pretrained(args.model_name, trust_remote_code=True)

    # set the vision tokenizer for decoding image.
    # The tokenizer stays in float32 independently of --torch_dtype, as before.
    dualvitok = AutoModel.from_pretrained(args.tokenizer_path,
                                          torch_dtype=torch.float32,
                                          trust_remote_code=True,
                                          ).eval().to(device)
    processor.set_vision_tokenizer(dualvitok)

    # (Optional): set the sdxl diffusion decoder. It will enable upsample 2x image resolution.
    processor.load_diffusion_vision_detokenizer(args.diffusion_decoder_path)

    # Streamer for incremental text output: skips the prompt and special
    # tokens, and gives up after 15 seconds without a new token.
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

    logging.info("ILLUME model built successfully.")

    demo = build_demo(args.embed)
    demo.queue(
        max_size=10,
        api_open=False
    ).launch(
        share=args.share,
    )