Kolors-Controlnet_and_IPA

Runtime error

App Files Files Community

lixiang46 committed on Aug 2, 2024

Commit

08f2519

1 Parent(s): 88a3aee

add ipa

Browse files

Files changed (3) hide show

app.py +41 -16
image/bird.png +0 -3
image/dog.png +0 -3

app.py CHANGED Viewed

@@ -23,15 +23,21 @@ device = "cuda"
 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
 ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
 ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
 text_encoder = ChatGLMModel.from_pretrained(f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16).half().to(device)
 tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
 vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
 scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
 unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
 controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None).half().to(device)
 controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
 pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
     vae=vae,
     controlnet = controlnet_depth,
@@ -52,6 +58,14 @@ pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
     force_zeros_for_empty_prompt=False
 )
 @spaces.GPU
 def process_canny_condition(image, canny_threods=[100,200]):
     np_image = image.copy()
@@ -77,6 +91,7 @@ MAX_IMAGE_SIZE = 1024
 @spaces.GPU
 def infer_depth(prompt,
           image = None,
           negative_prompt = "nsfw，脸部阴影，低分辨率，jpeg伪影、模糊、糟糕，黑脸，霓虹灯",
           seed = 397886929,
           randomize_seed = False,
@@ -84,19 +99,22 @@ def infer_depth(prompt,
           num_inference_steps = 50,
           controlnet_conditioning_scale = 0.7,
           control_guidance_end = 0.9,
-          strength = 1.0
         ):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
     init_image = resize_image(image,  MAX_IMAGE_SIZE)
-    pipe = pipe_depth.to("cuda")
     condi_img = process_depth_condition_midas( np.array(init_image), MAX_IMAGE_SIZE)
     image = pipe(
         prompt= prompt ,
         image = init_image,
         controlnet_conditioning_scale = controlnet_conditioning_scale,
         control_guidance_end = control_guidance_end,
         strength= strength ,
         control_image = condi_img,
         negative_prompt= negative_prompt ,
@@ -110,6 +128,7 @@ def infer_depth(prompt,
 @spaces.GPU
 def infer_canny(prompt,
           image = None,
           negative_prompt = "nsfw，脸部阴影，低分辨率，jpeg伪影、模糊、糟糕，黑脸，霓虹灯",
           seed = 397886929,
           randomize_seed = False,
@@ -117,19 +136,22 @@ def infer_canny(prompt,
           num_inference_steps = 50,
           controlnet_conditioning_scale = 0.7,
           control_guidance_end = 0.9,
-          strength = 1.0
         ):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
     init_image = resize_image(image,  MAX_IMAGE_SIZE)
-    pipe = pipe_canny.to("cuda")
     condi_img = process_canny_condition(np.array(init_image))
     image = pipe(
         prompt= prompt ,
         image = init_image,
         controlnet_conditioning_scale = controlnet_conditioning_scale,
         control_guidance_end = control_guidance_end,
         strength= strength ,
         control_image = condi_img,
         negative_prompt= negative_prompt ,
@@ -141,17 +163,13 @@ def infer_canny(prompt,
     return [condi_img, image], seed
 canny_examples = [
-    ["一个漂亮的女孩，高品质，超清晰，色彩鲜艳，超高分辨率，最佳品质，8k，高清，4K",
-     "image/woman_1.png"],
-    ["全景，一只可爱的白色小狗坐在杯子里，看向镜头，动漫风格，3d渲染，辛烷值渲染",
-    "image/dog.png"]
 ]
 depth_examples = [
-    ["新海诚风格，丰富的色彩，穿着绿色衬衫的女人站在田野里，唯美风景，清新明亮，斑驳的光影，最好的质量，超细节，8K画质",
-     "image/woman_2.png"],
-    ["一只颜色鲜艳的小鸟，高品质，超清晰，色彩鲜艳，超高分辨率，最佳品质，8k，高清，4K",
-     "image/bird.png"]
 ]
 css="""
@@ -239,6 +257,13 @@ with gr.Blocks(css=css) as Kolors:
                         step=0.1,
                         value=1.0,
                     )
             with gr.Row():
                 canny_button = gr.Button("Canny", elem_id="button")
                 depth_button = gr.Button("Depth", elem_id="button")
@@ -251,7 +276,7 @@ with gr.Blocks(css=css) as Kolors:
         gr.Examples(
                 fn = infer_canny,
                 examples = canny_examples,
-                inputs = [prompt, image],
                 outputs = [result, seed_used],
                 label = "Canny"
             )
@@ -259,20 +284,20 @@ with gr.Blocks(css=css) as Kolors:
         gr.Examples(
                 fn = infer_depth,
                 examples = depth_examples,
-                inputs = [prompt, image],
                 outputs = [result, seed_used],
                 label = "Depth"
             )
     canny_button.click(
         fn = infer_canny,
-        inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
         outputs = [result, seed_used]
     )
     depth_button.click(
         fn = infer_depth,
-        inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
         outputs = [result, seed_used]
     )

 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
 ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
 ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
+ckpt_dir_ipa = snapshot_download(repo_id="Kwai-Kolors/Kolors-IP-Adapter-Plus")
 text_encoder = ChatGLMModel.from_pretrained(f'{ckpt_dir}/text_encoder', torch_dtype=torch.float16).half().to(device)
 tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
 vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
 scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
 unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
 controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None).half().to(device)
 controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(f'{ckpt_dir_ipa}/weights/Kolors-IP-Adapter-Plus/image_encoder',  ignore_mismatched_sizes=True).to(dtype=torch.float16, device=device)
+ip_img_size = 336
+clip_image_processor = CLIPImageProcessor(size=ip_img_size, crop_size=ip_img_size )
 pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
     vae=vae,
     controlnet = controlnet_depth,
     force_zeros_for_empty_prompt=False
 )
+@spaces.GPU  # NOTE(review): load_ipa is invoked from infer_depth and infer_canny on every request, so the adapter weights are loaded again each call
+def load_ipa(pipe):  # attaches the Kolors IP-Adapter-Plus weights to `pipe` and returns that same pipe object
+    if hasattr(pipe.unet, 'encoder_hid_proj'):  # only when the UNet already exposes an encoder_hid_proj attribute
+        pipe.unet.text_encoder_hid_proj = pipe.unet.encoder_hid_proj  # stash the current projection under a second name; NOTE(review): on a repeat call this re-saves whatever encoder_hid_proj is by then -- confirm it is still the text projection
+    pipe.load_ip_adapter( f'{ckpt_dir_ipa}/weights/Kolors-IP-Adapter-Plus' , subfolder="", weight_name=["ip_adapter_plus_general.bin"])  # weights are read from the ckpt_dir_ipa snapshot directory
+    return pipe
 @spaces.GPU
 def process_canny_condition(image, canny_threods=[100,200]):
     np_image = image.copy()
 @spaces.GPU
 def infer_depth(prompt,
           image = None,
+          ipa_img = None,
           negative_prompt = "nsfw，脸部阴影，低分辨率，jpeg伪影、模糊、糟糕，黑脸，霓虹灯",
           seed = 397886929,
           randomize_seed = False,
           num_inference_steps = 50,
           controlnet_conditioning_scale = 0.7,
           control_guidance_end = 0.9,
+          strength = 1.0,
+          ip_scale = 0.5,
         ):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
     init_image = resize_image(image,  MAX_IMAGE_SIZE)
+    pipe = load_ipa(pipe_depth).to("cuda")
+    pipe.set_ip_adapter_scale([ip_scale])
     condi_img = process_depth_condition_midas( np.array(init_image), MAX_IMAGE_SIZE)
     image = pipe(
         prompt= prompt ,
         image = init_image,
         controlnet_conditioning_scale = controlnet_conditioning_scale,
         control_guidance_end = control_guidance_end,
+        ip_adapter_image=[ipa_img],
         strength= strength ,
         control_image = condi_img,
         negative_prompt= negative_prompt ,
 @spaces.GPU
 def infer_canny(prompt,
           image = None,
+          ipa_img = None,
           negative_prompt = "nsfw，脸部阴影，低分辨率，jpeg伪影、模糊、糟糕，黑脸，霓虹灯",
           seed = 397886929,
           randomize_seed = False,
           num_inference_steps = 50,
           controlnet_conditioning_scale = 0.7,
           control_guidance_end = 0.9,
+          strength = 1.0,
+          ip_scale = 0.5,
         ):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed)
     init_image = resize_image(image,  MAX_IMAGE_SIZE)
+    pipe = load_ipa(pipe_canny).to("cuda")
+    pipe.set_ip_adapter_scale([ip_scale])
     condi_img = process_canny_condition(np.array(init_image))
     image = pipe(
         prompt= prompt ,
         image = init_image,
         controlnet_conditioning_scale = controlnet_conditioning_scale,
         control_guidance_end = control_guidance_end,
+        ip_adapter_image=[ipa_img],
         strength= strength ,
         control_image = condi_img,
         negative_prompt= negative_prompt ,
     return [condi_img, image], seed
 canny_examples = [
+    ["一个红色头发的女孩，唯美风景，清新明亮，斑驳的光影，最好的质量，超细节，8K画质",
+     "image/woman_2.png", "image/2.png"],
 ]
 depth_examples = [
+    ["一个漂亮的女孩，最好的质量，超细节，8K画质",
+     "image/1.png","image/woman_1.png"],
 ]
 css="""
                         step=0.1,
                         value=1.0,
                     )
+                    ip_scale = gr.Slider(
+                        label="IP_Scale",
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                        value=0.5,
+                    )
             with gr.Row():
                 canny_button = gr.Button("Canny", elem_id="button")
                 depth_button = gr.Button("Depth", elem_id="button")
         gr.Examples(
                 fn = infer_canny,
                 examples = canny_examples,
+                inputs = [prompt, image, ipa_image],
                 outputs = [result, seed_used],
                 label = "Canny"
             )
         gr.Examples(
                 fn = infer_depth,
                 examples = depth_examples,
+                inputs = [prompt, image, ipa_image],
                 outputs = [result, seed_used],
                 label = "Depth"
             )
     canny_button.click(
         fn = infer_canny,
+        inputs = [prompt, image, ipa_image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength, ip_scale],
         outputs = [result, seed_used]
     )
     depth_button.click(
         fn = infer_depth,
+        inputs = [prompt, image, ipa_image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength, ip_scale],
         outputs = [result, seed_used]
     )

image/bird.png DELETED Viewed

Git LFS Details

SHA256: e74821365819a2141455e85d5a1c4fa443167dc707e296059c6f4a9d3d93b2f5
Pointer size: 131 Bytes
Size of remote file: 612 kB

image/dog.png DELETED Viewed

Git LFS Details

SHA256: a48c9d517b9a9bd27f31c7fa7e6e4128e27e485168c566dc88db9ece60703338
Pointer size: 132 Bytes
Size of remote file: 1.48 MB