lzyhha committed · Commit b4faa43 · 1 Parent(s): 391f6e1
Files changed (3):
  1. app.py +22 -16
  2. demo_tasks/gradio_tasks_unseen.py +1 -1
  3. visualcloze.py +19 -17
app.py CHANGED
@@ -17,6 +17,10 @@ default_steps = 30
 
 GUIDANCE = """
 
+
+ ## 📧 Contact:
+ Need help or have questions? Contact us at: lizhongyu [AT] mail.nankai.edu.cn.
+
 ## 📋 Quick Start Guide:
 1. Adjust **Number of In-context Examples**, 0 disables in-context learning.
 2. Set **Task Columns**, the number of images involved in a task.
@@ -24,16 +28,18 @@ GUIDANCE = """
 4. Click **Generate** to create the images.
 5. Parameters can be fine-tuned under **Advanced Options**.
 
- <div style='font-size: 20px; color:red;'>🔥 Click the task button in the right bottom to acquire examples of various tasks.</div>
+ ## 🔥 Task Examples:
+ Click the task button in the right bottom to acquire **examples** of various tasks.
+ Make sure all images and prompts are loaded before clicking the generate button.
 
- <div style='font-size: 20px; '> 📧 Need help or have questions? Contact us at: lizhongyu [AT] mail.nankai.edu.cn</div>
 
- <div style='font-size: 20px;'>
- 💻 The runtime on the zero GPU runtime depends on the size of the image grid.
- When generating an image with the resoluation of 1024, the runtime is approximately <span style='font-weight: bold; color:red;'>[45s for a 2x2 grid], [55s for a 2x3 grid], [70s for a 3x3 grid], [90s for a 3x4 grid]</span>.
- When generating three images in a 3x4 grid, i.e., Image to Depth + Normal + Hed, the runtime is approximately <span style='font-weight: bold; color:red;'>110s</span>.
- Deploying locally with an 80G A100 can reduce the runtime by more than half.
- </div>
+ ## 💻 Runtime on the Zero GPU:
+ The runtime on the Zero GPU runtime depends on the size of the image grid.
+ When generating an image with the resoluation of 1024,
+ the runtime is approximately **[45s for a 2x2 grid], [55s for a 2x3 grid], [70s for a 3x3 grid], [90s for a 3x4 grid]**.
+ When generating three images in a 3x4 grid, i.e., Image to Depth + Normal + Hed,
+ the runtime is approximately **110s**.
+ **Deploying locally with an 80G A100 can reduce the runtime by more than half.**
 
 """
 
@@ -90,9 +96,7 @@ def create_demo(model):
 for i in range(max_grid_h):
     # Add row label before each row
     row_texts.append(gr.Markdown(
-         "<div style='font-size: 24px; font-weight: bold;'>" +
-         ("query" if i == default_grid_h - 1 else f"In-context Example {i + 1}") +
-         "</div>",
+         "## Query" if i == default_grid_h - 1 else f"## In-context Example {i + 1}",
         elem_id=f"row_text_{i}",
         visible=i < default_grid_h
     ))
@@ -297,9 +301,7 @@ def create_demo(model):
 gr.update(
     elem_id=f"row_text_{i}",
     visible=i < actual_h,
-     value="<div style='font-size: 24px; font-weight: bold;'>" +
-     ("Query" if i == actual_h - 1 else f"In-context Example {i + 1}") +
-     "</div>",
+     value="## Query" if i == actual_h - 1 else f"## In-context Example {i + 1}",
 )
 )
 
@@ -314,6 +316,9 @@ def create_demo(model):
 images.append([])
 for j in range(model.grid_w):
     images[i].append(inputs[i * max_grid_w + j])
+     if i != model.grid_h - 1:
+         if inputs[i * max_grid_w + j] is None:
+             raise gr.Error('Please upload in-context examples.')
 seed, cfg, steps, upsampling_steps, upsampling_noise, layout_text, task_text, content_text = inputs[-8:]
 
 results = generate(
@@ -489,7 +494,7 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
 
-     snapshot_download(repo_id="VisualCloze/VisualCloze", repo_type="model", local_dir="models")
+     # snapshot_download(repo_id="VisualCloze/VisualCloze", repo_type="model", local_dir="models")
 
     # Initialize model
     model = VisualClozeModel(resolution=args.resolution, model_path=args.model_path, precision=args.precision)
@@ -498,4 +503,5 @@ if __name__ == "__main__":
     demo = create_demo(model)
 
     # Start Gradio server
-     demo.launch()
+     demo.launch()
+     # demo.launch(share=False, server_port=10050, server_name="0.0.0.0")
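The new loop in `create_demo` now rejects incomplete in-context rows before generation starts. A minimal sketch of that validation, assuming the flat `inputs` list and the `grid_h`/`grid_w`/`max_grid_w` layout used in the diff (the surrounding Gradio wiring is omitted):

```python
import gradio as gr

def collect_grid(inputs, grid_h, grid_w, max_grid_w):
    """Rebuild the image grid from the flat Gradio input list (sketch only)."""
    images = []
    for i in range(grid_h):
        images.append([])
        for j in range(grid_w):
            img = inputs[i * max_grid_w + j]
            images[i].append(img)
            # Every row except the last (query) row is an in-context example,
            # so a missing image there aborts the request with a UI error.
            if i != grid_h - 1 and img is None:
                raise gr.Error('Please upload in-context examples.')
    return images
```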
demo_tasks/gradio_tasks_unseen.py CHANGED
@@ -253,7 +253,7 @@ def process_unseen_tasks(x):
 mask = task.get('mask', [0 for _ in range(grid_w - 1)] + [1])
 layout_prompt = get_layout_instruction(grid_w, grid_h)
 
- upsampling_noise = None
+ upsampling_noise = 0.7
 steps = None
 outputs = [mask, grid_h, grid_w, layout_prompt, task_prompt, content_prompt, upsampling_noise, steps] + rets
 break
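Replacing the `None` default with `0.7` matters downstream: the guard added in `visualcloze.py` (below) compares `upsampling_noise < 1.0`, and comparing `None` against a float raises a `TypeError` in Python 3. A small illustration, assuming the value chosen here eventually reaches that comparison through the UI:

```python
# Illustration only: why a numeric default is needed once the
# `upsampling_noise < 1.0` guard from this commit is in place.
upsampling_noise = None
try:
    upsampling_noise < 1.0
except TypeError as exc:
    print(f"TypeError: {exc}")      # None cannot be ordered against a float

upsampling_noise = 0.7              # new default set in process_unseen_tasks
print(upsampling_noise < 1.0)       # True -> the upsampling branch may run
```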
visualcloze.py CHANGED
@@ -91,26 +91,26 @@ class VisualClozeModel:
 self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 self.dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[self.precision]
 
- # Initialize model
- print("Initializing model...")
- self.model = load_flow_model(model_name, device=self.device, lora_rank=self.lora_rank)
+ # # Initialize model
+ # print("Initializing model...")
+ # self.model = load_flow_model(model_name, device=self.device, lora_rank=self.lora_rank)
 
- # Initialize VAE
- print("Initializing VAE...")
- self.ae = AutoencoderKL.from_pretrained(f"black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=self.dtype).to(self.device)
- self.ae.requires_grad_(False)
+ # # Initialize VAE
+ # print("Initializing VAE...")
+ # self.ae = AutoencoderKL.from_pretrained(f"black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=self.dtype).to(self.device)
+ # self.ae.requires_grad_(False)
 
- # Initialize text encoders
- print("Initializing text encoders...")
- self.t5 = load_t5(self.device, max_length=self.max_length)
- self.clip = load_clip(self.device)
+ # # Initialize text encoders
+ # print("Initializing text encoders...")
+ # self.t5 = load_t5(self.device, max_length=self.max_length)
+ # self.clip = load_clip(self.device)
 
- self.model.eval().to(self.device, dtype=self.dtype)
+ # self.model.eval().to(self.device, dtype=self.dtype)
 
- # Load model weights
- ckpt = torch.load(model_path)
- self.model.load_state_dict(ckpt, strict=False)
- del ckpt
+ # # Load model weights
+ # ckpt = torch.load(model_path)
+ # self.model.load_state_dict(ckpt, strict=False)
+ # del ckpt
 
 # Initialize sampler
 transport = create_transport(
@@ -337,6 +337,8 @@ class VisualClozeModel:
 processed_images.append(blank)
 if i == grid_h - 1:
     mask_position.append(1)
+
+ return processed_images
 
 if len(mask_position) > 1 and sum(mask_position) > 1:
     if target_size is None:
@@ -443,7 +445,7 @@ class VisualClozeModel:
 if True: # images[i] is None:
     cropped = output_images[-1].crop(((i - row_start) * ret_w // self.grid_w, 0, ((i - row_start) + 1) * ret_w // self.grid_w, ret_h))
     ret.append(cropped)
-     if mask_position[i - row_start] and is_upsampling:
+     if mask_position[i - row_start] and is_upsampling and upsampling_noise < 1.0:
         upsampled = self.upsampling(
             cropped,
             upsampling_size,
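The tightened condition means a crop is only sent through the extra upsampling pass when its grid position was masked (i.e., generated), upsampling is requested, and the noise level is below 1.0, so a noise value of 1.0 now acts as an off switch. A minimal sketch of that predicate, reusing the names from the diff:

```python
def should_upsample(mask_flag: int, is_upsampling: bool, upsampling_noise: float) -> bool:
    """Sketch of the guard added in this commit (names follow the diff)."""
    # Only masked (generated) positions qualify, upsampling must be enabled,
    # and a noise level of 1.0 or higher skips the extra pass entirely.
    return bool(mask_flag) and is_upsampling and upsampling_noise < 1.0

print(should_upsample(1, True, 0.7))   # True  -> refine the generated crop
print(should_upsample(1, True, 1.0))   # False -> noise >= 1.0 disables upsampling
print(should_upsample(0, True, 0.7))   # False -> in-context cells are never upsampled
```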