ILLUME_plus-7b

Running on Zero

App Files Files Community

huangrh9 committed on May 29

Commit

cff4f35

verified ·

1 Parent(s): 83e02a2

Update app.py

Browse files

Files changed (1) hide show

app.py +336 -49

app.py CHANGED Viewed

@@ -1,64 +1,351 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
 """
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import argparse
+import datetime
+import json
+import os
+import time
+import torch
 import gradio as gr
+from PIL import Image
+from tokenizer.sdxl_decoder_pipe import StableDiffusionXLDecoderPipeline
+from torchvision import transforms
+import logging
+from utils.registry_utils import Config
+from tokenizer.builder import build_vq_model
+from dataset.multi_ratio_dataset import get_image_size, assign_ratio
+def read_config(file):
+    """Load the config at `file`, retrying until a non-empty config comes back.
+
+    An empty result is treated as a loading conflict between processes, so
+    the load is repeated every 0.1 s. There is no upper bound on retries.
+    """
+    config = Config.fromfile(file)
+    while len(config) == 0:
+        # Empty config: wait briefly, then read the file again.
+        time.sleep(0.1)
+        config = Config.fromfile(file)
+    return config
+def build_logger(name, log_file):
+    """Return the INFO-level logger `name` that writes records to `log_file`.
+
+    `logging.getLogger` returns the same logger object for the same name, so
+    a file handler for `log_file` is attached only if one is not already
+    present. Calling this function repeatedly therefore does not duplicate
+    every log line.
+
+    Args:
+        name: logger name passed to `logging.getLogger`.
+        log_file: path of the file the handler appends to.
+
+    Returns:
+        The configured `logging.Logger`.
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+    # FileHandler stores its target as an absolute path in `baseFilename`.
+    target = os.path.abspath(log_file)
+    already_attached = any(
+        isinstance(existing, logging.FileHandler) and existing.baseFilename == target
+        for existing in logger.handlers
+    )
+    if not already_attached:
+        handler = logging.FileHandler(log_file)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+    return logger
+# Module-wide logger; records go to gradio_web_server.log.
+logger = build_logger("gradio_web_server", "gradio_web_server.log")
+# Tokenizer model shared by all requests; assigned by load_vqgan_model().
+vq_model = None
+# Flag read by reconstruct_fn() to decide whether the weights must be reloaded
+# when the "Use EMA Model" checkbox changes.
+is_ema_model = False
+# Diffusion decoder pipeline; assigned by load_diffusion_decoder() on first use.
+diffusion_pipeline = None
+# When False, build_gradio_interface() loads the tokenizer at startup;
+# when True, loading is deferred to the first reconstruction request.
+lazy_load = False
+# Diffusion decoder hyperparameters.
+# Candidate output sizes; the selected entry is unpacked as (height, width)
+# in vqgan_diffusion_decoder_reconstruct().
+resolution_list = [
+    (1024, 1024), (768, 1024), (1024, 768),
+    (512, 2048), (2048, 512), (640, 1920),
+    (1920, 640), (768, 1536),
+    (1536, 768), (768, 1152), (1152, 768)
+]
+# (min, max) bounds of the "CFG Value" slider.
+cfg_range = (1, 10.0)
+# (min, max) bounds of the "Inference Steps" slider.
+step_range = (1, 100)
+def resize_to_shortest_edge(img, shortest_edge_resolution):
+    """Resize `img` so its shorter side equals `shortest_edge_resolution`.
+
+    The longer side is scaled by the same factor and truncated to an int.
+    A square image becomes `shortest_edge_resolution` on both sides.
+    Returns the resized copy produced by `img.resize`.
+    """
+    w, h = img.size
+    target = shortest_edge_resolution
+    if w == h:
+        new_size = (target, target)
+    elif w < h:
+        # Portrait: width is the short side.
+        new_size = (target, int(h * (target / w)))
+    else:
+        # Landscape: height is the short side.
+        new_size = (int(w * (target / h)), target)
+    return img.resize(new_size)
+from PIL import Image
+def resize_to_square_with_long_edge(image: Image.Image, size: int = 512):
+    """Scale `image` so its longer side is `size` pixels.
+
+    The shorter side is scaled proportionally (truncated to an int), so the
+    result is generally not square; no padding is applied here. Resampling
+    uses `Image.LANCZOS`.
+    """
+    width, height = image.size
+    is_landscape = width > height
+    if is_landscape:
+        scaled = (size, int(size * height / width))
+    else:
+        scaled = (int(size * width / height), size)
+    return image.resize(scaled, Image.LANCZOS)
+def pad_to_square(image: Image.Image, target_size: int = 512, color=(255, 255, 255)):
+    """Fit `image` inside a `target_size` x `target_size` RGB canvas.
+
+    The image is first scaled so its longer side equals `target_size`, then
+    pasted centred onto a new canvas filled with `color`.
+    """
+    fitted = resize_to_square_with_long_edge(image, target_size)
+    canvas = Image.new("RGB", (target_size, target_size), color)
+    # Split the leftover space evenly on both sides (floor division).
+    left = (target_size - fitted.width) // 2
+    top = (target_size - fitted.height) // 2
+    canvas.paste(fitted, (left, top))
+    return canvas
+def load_vqgan_model(args, model_dtype='fp16', use_ema=False, ):
+    """Build the tokenizer model, load its weights, and publish it globally.
+
+    The model is built from `args.vq_model`, optionally cast to half
+    precision, moved to CUDA, put in eval mode, and loaded from the
+    checkpoint at `args.vq_ckpt`. The module-level `vq_model` and
+    `is_ema_model` are updated together, and only after the weights loaded
+    successfully, so a failed load never leaves an uninitialised model or a
+    stale EMA flag behind.
+
+    Args:
+        args: config object providing `vq_model` and `vq_ckpt`.
+        model_dtype: 'fp16' or 'bf16' casts the model; any other value keeps
+            the dtype the model was built with.
+        use_ema: load the checkpoint's "ema" weights instead of the regular
+            ones.
+
+    Returns:
+        The loaded model (also stored in the global `vq_model`).
+
+    Raises:
+        KeyError: if `use_ema` is set but the checkpoint has no "ema" entry.
+    """
+    global vq_model, is_ema_model
+    model = build_vq_model(args.vq_model)
+    if model_dtype == 'fp16':
+        model = model.to(torch.float16)
+        logger.info("Convert the model dtype to float16")
+    elif model_dtype == 'bf16':
+        model = model.to(torch.bfloat16)
+        logger.info("Convert the model dtype to bfloat16")
+    model.to('cuda')
+    model.eval()
+    # NOTE(review): torch.load unpickles the file; only point --vq-ckpt at
+    # checkpoints from a trusted source.
+    checkpoint = torch.load(args.vq_ckpt, map_location="cpu")
+    if use_ema:
+        if "ema" not in checkpoint:
+            raise KeyError(
+                f"Checkpoint {args.vq_ckpt!r} has no 'ema' weights; cannot load the EMA model"
+            )
+        state_dict = checkpoint["ema"]
+    elif "model" in checkpoint:
+        state_dict = checkpoint["model"]
+    elif "state_dict" in checkpoint:
+        state_dict = checkpoint["state_dict"]
+    else:
+        # The checkpoint is assumed to be the bare state dict itself.
+        state_dict = checkpoint
+    model.load_state_dict(state_dict, strict=True)
+    vq_model = model
+    # Record which weights are live so callers can tell when a reload is needed.
+    is_ema_model = use_ema
+    return vq_model
+def load_diffusion_decoder(args):
+    """Create the diffusion decoder pipeline and store it in `diffusion_pipeline`.
+
+    The pipeline is loaded from `args.sdxl_decoder_path`, receives the
+    current global `vq_model` together with `args` as its VQ config, and is
+    moved to the device `vq_model` lives on.
+    """
+    global diffusion_pipeline
+    pipeline_options = dict(
+        add_watermarker=False,
+        vq_config=args,
+        vq_model=vq_model,
+    )
+    diffusion_pipeline = StableDiffusionXLDecoderPipeline.from_pretrained(
+        args.sdxl_decoder_path, **pipeline_options
+    )
+    diffusion_pipeline.to(vq_model.device)
+def vqgan_diffusion_decoder_reconstruct(input_image, diffusion_upsample, cfg_values, steps):
+    """Reconstruct `input_image` with the diffusion decoder pipeline.
+
+    Args:
+        input_image: PIL image to reconstruct.
+        diffusion_upsample: when truthy, the target output size is twice the
+            input size; otherwise it equals the input size.
+        cfg_values: forwarded to the pipeline as `guidance_scale`.
+        steps: forwarded to the pipeline as `num_inference_steps`.
+
+    Returns:
+        `(image, text)`: the reconstructed PIL image resized to the target
+        size, and a markdown string reporting that size.
+    """
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+    ])
+    input_tensor = transform(input_image).unsqueeze(0).to(vq_model.device)
+    org_width, org_height = input_image.size
+    if diffusion_upsample:
+        width, height = org_width * 2, org_height * 2
+    else:
+        width, height = org_width, org_height
+    print(diffusion_upsample, org_width, org_height, width, height)
+    # Pick the supported generation size for this aspect ratio; entries of
+    # resolution_list are unpacked as (height, width).
+    group_index = assign_ratio(height, width, resolution_list)
+    select_h, select_w = resolution_list[group_index]
+    diffusion_outputs = diffusion_pipeline(
+        images=input_tensor,
+        height=select_h,
+        width=select_w,
+        guidance_scale=cfg_values,
+        num_inference_steps=steps
+    )
+    sample = diffusion_outputs.images[0]
+    # Image.resize returns a new image; keep it so the returned image really
+    # has the resolution reported below.
+    sample = sample.resize((width, height))
+    return sample, f"�� **Output Resolution**: {width}x{height}"
+@torch.no_grad()
+def vqgan_reconstruct(input_image):
+    """Encode and decode `input_image` with the global `vq_model`.
+
+    The image is resized so both sides are multiples of 16 before encoding,
+    and the decoded result is resized back to the original size.
+
+    Args:
+        input_image: PIL image to reconstruct.
+
+    Returns:
+        `(image, text)`: the reconstructed PIL image at the original
+        resolution, and a markdown string reporting that resolution.
+    """
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+    ])
+    org_width, org_height = input_image.size
+    # Round each side down to a multiple of 16 (presumably the tokenizer's
+    # spatial stride -- confirm), but never below 16 so that tiny inputs do
+    # not collapse to a zero-sized image.
+    width = max(16, org_width // 16 * 16)
+    height = max(16, org_height // 16 * 16)
+    input_image = input_image.resize((width, height))
+    input_tensor = transform(input_image).unsqueeze(0).to(vq_model.device)
+    # Gradients are already disabled by the decorator.
+    inputs = vq_model.get_input(dict(image=input_tensor))
+    (quant_semantic, _, _, _), (quant_detail, _, _) = vq_model.encode(**inputs)
+    reconstructed_image = vq_model.decode(quant_semantic, quant_detail)
+    # Map the decoder output to the 0..255 pixel range.
+    reconstructed_image = torch.clamp(127.5 * reconstructed_image + 128.0, 0, 255)
+    # Cast to float32 first: NumPy cannot represent bfloat16 tensors.
+    reconstructed_image = (
+        reconstructed_image.squeeze(0).permute(1, 2, 0).float().cpu().numpy().astype('uint8')
+    )
+    output_image = Image.fromarray(reconstructed_image)
+    # Image.resize returns a new image; keep it so the output matches the
+    # resolution reported below.
+    output_image = output_image.resize((org_width, org_height))
+    return output_image, f"�� **Output Resolution**: {org_width}x{org_height}"
+# Page header rendered at the top of the demo.
+title_markdown = '''# DualViTok Demo
+The DualViTok is a dual-branch vision tokenizer designed to capture both deep semantics and fine-grained textures. Implementation details can be found in ILLUME+[[ArXiv](https://arxiv.org/abs/2504.01934)].
+'''
+# Collapsible usage instructions rendered below the header.
+usage_markdown = """
+<details>
+<summary><strong>�� Usage Instructions (click to expand)</strong></summary>
+1. Upload an image and click the <strong>Reconstruct</strong> button.
+2. Set <code>Max Shortest Side</code> to limit the image resolution.
+3. Click <code>Force Upscale to Max Shortest Side</code> to enable <strong>Force Upscale</strong> to resize the shortest side of the image to the <code>Max Shortest Side</code>.
+4. <em>(Optional)</em> Check <code>Use EMA model</code> to use the EMA checkpoint for reconstruction.
+5. <em>(Optional)</em> Click <code>Load Diffusion Decoder</code> to enable Diffusion Model decoding.
+   You can also enable <code>2x Upsample</code> to apply super-resolution to the uploaded image.
+</details>
 """
+def build_gradio_interface(args):
+    """Build the Gradio Blocks UI for the tokenizer demo and launch it.
+
+    Unless `lazy_load` is set, the tokenizer model is loaded up front. The
+    page has three columns: input image, reconstructed image, and
+    hyperparameters. The server is launched on all interfaces (0.0.0.0).
+
+    Args:
+        args: config object; `model_dtype` is read here, and the object is
+            forwarded to the model and diffusion decoder loaders.
+    """
+    if not lazy_load:
+        load_vqgan_model(args, model_dtype=args.model_dtype)
+    with gr.Blocks() as demo:
+        gr.Markdown(title_markdown)
+        gr.Markdown(usage_markdown)
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## ��️ Input Image")
+                input_image = gr.Image(type="pil", label="Upload Image", width=384, height=384)
+                input_resolution_display = gr.Markdown("")
+                gr.Examples(
+                    examples=[
+                        ["../configs/data_configs/test_data_examples/ImageUnderstandingExample/images/1.png",],
+                        ["../configs/data_configs/test_data_examples/ImageUnderstandingExample/images/2.png",],
+                        ["../configs/data_configs/test_data_examples/ImageUnderstandingExample/images/3.png",],
+                    ],
+                    inputs=input_image,
+                    label="Example Images",
+                )
+            with gr.Column():
+                gr.Markdown("## �� Reconstructed Image")
+                output_image_recon = gr.Image(type="pil", label="Reconstruction", width=384, height=384)
+                output_resolution_display = gr.Markdown("")
+            with gr.Column():
+                gr.Markdown("## ⚙ Hyperparameters")
+                short_resolution_dropdown = gr.Dropdown(
+                    choices=[None, 256, 384, 512, 1024],
+                    value=1024,
+                    label="Max Shortest Side"
+                )
+                force_upscale_checkbox = gr.Checkbox(label="Force Upscale to Max Shortest Side", value=False)
+                use_ema_checkbox = gr.Checkbox(label="Use EMA Model", value=False)
+                with gr.Accordion("�� Use Diffusion Decoder", open=False):
+                    use_diffusion_checkbox = gr.Checkbox(label="Load Diffusion Decoder", value=False)
+                    diffusion_upsample_checkbox = gr.Checkbox(label="Enable 2x Upsample", value=False)
+                    cfg_slider = gr.Slider(
+                        minimum=cfg_range[0], maximum=cfg_range[1],
+                        step=0.5, value=1.5,
+                        label="CFG Value"
+                    )
+                    step_slider = gr.Slider(
+                        minimum=step_range[0], maximum=step_range[1],
+                        step=1, value=20,
+                        label="Inference Steps"
+                    )
+                reconstruct_btn = gr.Button("�� Reconstruct", variant="primary")
+        def handle_input_image(image):
+            """Convert an uploaded image to RGB and report its resolution."""
+            if image is not None:
+                image = image.convert("RGB")
+                w, h = image.size
+                return image, f"�� **Input Resolution**: {w}x{h}"
+            return None, ""
+        input_image.change(
+            handle_input_image,
+            inputs=input_image,
+            outputs=[input_image, input_resolution_display]
+        )
+        def reconstruct_fn(image, use_ema_flag, short_edge_resolution, force_upscale,
+                           use_diffusion_flag, diffusion_upsample, cfg_value, num_steps):
+            """Run one reconstruction with the options currently selected in the UI.
+
+            Returns the reconstruction padded onto a 384x384 canvas for
+            display, plus the markdown resolution string.
+            """
+            global vq_model
+            # The button can be clicked before anything is uploaded; show a
+            # readable error in the UI instead of failing on `image.size`.
+            if image is None:
+                raise gr.Error("Please upload an image before clicking Reconstruct.")
+            if short_edge_resolution is not None:
+                # Downscale when the image exceeds the limit; rescale
+                # unconditionally when Force Upscale is checked.
+                if force_upscale or min(image.size) > short_edge_resolution:
+                    image = resize_to_shortest_edge(image, int(short_edge_resolution))
+            if lazy_load and vq_model is None:
+                load_vqgan_model(args, model_dtype=args.model_dtype)
+            # Reload the weights only when the requested variant differs from
+            # what `is_ema_model` says is currently loaded.
+            if use_ema_flag:
+                if not is_ema_model:
+                    load_vqgan_model(args, model_dtype=args.model_dtype, use_ema=True)
+                    logger.info("Switched to EMA checkpoint")
+            else:
+                if is_ema_model:
+                    load_vqgan_model(args, model_dtype=args.model_dtype, use_ema=False)
+                    logger.info("Switched to non-EMA checkpoint")
+            if use_diffusion_flag:
+                # The diffusion decoder is loaded on first use.
+                if diffusion_pipeline is None:
+                    load_diffusion_decoder(args)
+                recon_image, resolution_str = vqgan_diffusion_decoder_reconstruct(image, diffusion_upsample, cfg_value,
+                                                                                  num_steps)
+            else:
+                recon_image, resolution_str = vqgan_reconstruct(image)
+            return pad_to_square(recon_image, target_size=384), resolution_str
+        reconstruct_btn.click(
+            reconstruct_fn,
+            inputs=[input_image, use_ema_checkbox, short_resolution_dropdown, force_upscale_checkbox,
+                    use_diffusion_checkbox, diffusion_upsample_checkbox, cfg_slider, step_slider],
+            outputs=[output_image_recon, output_resolution_display])
+    demo.launch(server_name='0.0.0.0')
+# Main entry point
+def main():
+    """Parse the command line, merge the options into the config, and start the demo.
+
+    The positional `config` argument is loaded with `read_config`; the
+    checkpoint path, dtypes, verbosity and diffusion decoder path from the
+    command line are then copied onto the config object before it is passed
+    to `build_gradio_interface`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("config", type=str)
+    parser.add_argument("--local_rank", type=int, default=0)
+    parser.add_argument("--vq-ckpt", type=str, help="ckpt path for vq model")
+    for dtype_flag in ("--torch-dtype", "--model-dtype"):
+        parser.add_argument(dtype_flag, type=str, default='fp32')
+    parser.add_argument("--sdxl-decoder-path", type=str, default=None)
+    parser.add_argument("--verbose", action='store_true')
+    cli_args = parser.parse_args()
+    config = read_config(cli_args.config)
+    # Command-line values are copied onto the loaded config under the same names.
+    for field in ("vq_ckpt", "torch_dtype", "model_dtype", "verbose", "sdxl_decoder_path"):
+        setattr(config, field, getattr(cli_args, field))
+    build_gradio_interface(config)
 if __name__ == "__main__":
+    main()