Spaces: Running on Zero

update to bf16

Browse files
- app.py +4 -4
- ip_adapter/custom_pipelines.py +2 -2
- ip_adapter/ip_adapter.py +22 -22
app.py CHANGED

@@ -61,7 +61,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 adapter_name = "h94/IP-Adapter/models/ip-adapter-plus_sd15.bin"
 pipe = StableDiffusionCustomPipeline.from_pretrained(
     "SG161222/Realistic_Vision_V5.1_noVAE",
-    torch_dtype=torch.float16,
+    torch_dtype=torch.bfloat16,
     feature_extractor=None,
     safety_checker=None
 )

@@ -91,7 +91,7 @@ def change_model_fn(model_name: str) -> None:
     pipe = StableDiffusionXLCustomPipeline.from_pretrained(
         name_mapping[model_name],
         # variant="fp16",
-        torch_dtype=torch.float16,
+        torch_dtype=torch.bfloat16,
         feature_extractor=None
     )
     pipeline = ConceptrolIPAdapterPlusXL(pipe, "", adapter_name, device, num_tokens=16)

@@ -117,7 +117,7 @@ def change_model_fn(model_name: str) -> None:
     adapter_name = "h94/IP-Adapter/models/ip-adapter-plus_sd15.bin"
     pipe = StableDiffusionCustomPipeline.from_pretrained(
         name_mapping[model_name],
-        torch_dtype=torch.float16,
+        torch_dtype=torch.bfloat16,
         feature_extractor=None,
         safety_checker=None
     )

@@ -389,4 +389,4 @@ with gr.Blocks(css="style.css") as demo:
     )
     gr.Markdown(article)

-demo.launch(
+demo.launch()
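The app.py edits are the same one-line dtype switch on each from_pretrained call. A minimal, self-contained sketch of that pattern using the stock diffusers StableDiffusionPipeline (the repo's StableDiffusionCustomPipeline is assumed to accept the same keyword arguments):

import torch
from diffusers import StableDiffusionPipeline

# Load the SD 1.5 base model directly in bfloat16 instead of float16,
# mirroring the torch_dtype change made throughout app.py.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE",
    torch_dtype=torch.bfloat16,
    feature_extractor=None,
    safety_checker=None,
).to(device)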
ip_adapter/custom_pipelines.py CHANGED

@@ -408,7 +408,7 @@ class StableDiffusionXLCustomPipeline(StableDiffusionXLPipeline):
         if not output_type == "latent":
             # make sure the VAE is in float32 mode, as it overflows in float16
             needs_upcasting = (
-                self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+                self.vae.dtype == torch.bfloat16 and self.vae.config.force_upcast
             )

             if needs_upcasting:

@@ -423,7 +423,7 @@ class StableDiffusionXLCustomPipeline(StableDiffusionXLPipeline):

             # cast back to fp16 if needed
             if needs_upcasting:
-                self.vae.to(dtype=torch.float16)
+                self.vae.to(dtype=torch.bfloat16)
         else:
             image = latents

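The custom_pipelines.py hunks keep the SDXL decode-time guard but make it track the new working dtype. A minimal sketch of that guard, assuming a diffusers AutoencoderKL-style vae with a force_upcast config flag:

import torch

def decode_with_upcast(vae, latents):
    # The VAE is temporarily run in float32 when its config requests upcasting;
    # with this commit the check (and the cast back afterwards) target bfloat16
    # rather than float16.
    needs_upcasting = vae.dtype == torch.bfloat16 and vae.config.force_upcast
    if needs_upcasting:
        vae.to(dtype=torch.float32)
        latents = latents.to(torch.float32)
    image = vae.decode(latents / vae.config.scaling_factor).sample
    if needs_upcasting:
        vae.to(dtype=torch.bfloat16)
    return image

Since bfloat16 shares float32's exponent range, the overflow the original comment warns about is mainly a float16 concern; the guard is kept here with only the dtype swapped.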
ip_adapter/ip_adapter.py CHANGED

@@ -85,8 +85,8 @@ class IPAdapter:
         self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             "h94/IP-Adapter",
             subfolder="models/image_encoder",
-            torch_dtype=torch.float16,
-        ).to(self.device, dtype=torch.float16)
+            torch_dtype=torch.bfloat16,
+        ).to(self.device, dtype=torch.bfloat16)
         self.clip_image_processor = CLIPImageProcessor()
         # image proj model
         self.image_proj_model = self.init_proj()

@@ -98,7 +98,7 @@ class IPAdapter:
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.projection_dim,
             clip_extra_context_tokens=self.num_tokens,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     def set_ip_adapter(self):

@@ -126,7 +126,7 @@ class IPAdapter:
                     cross_attention_dim=cross_attention_dim,
                     scale=1.0,
                     num_tokens=self.num_tokens,
-                ).to(self.device, dtype=torch.float16)
+                ).to(self.device, dtype=torch.bfloat16)
         unet.set_attn_processor(attn_procs)
         if hasattr(self.pipe, "controlnet"):
             if isinstance(self.pipe.controlnet, MultiControlNetModel):

@@ -167,10 +167,10 @@ class IPAdapter:
                 images=pil_image, return_tensors="pt"
             ).pixel_values
             clip_image_embeds = self.image_encoder(
-                clip_image.to(self.device, dtype=torch.float16)
+                clip_image.to(self.device, dtype=torch.bfloat16)
             ).image_embeds
         else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.bfloat16)
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_image_prompt_embeds = self.image_proj_model(
             torch.zeros_like(clip_image_embeds)

@@ -282,8 +282,8 @@ class ConceptrolIPAdapter:
         self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             "h94/IP-Adapter",
             subfolder="models/image_encoder",
-            torch_dtype=torch.float16,
-        ).to(self.device, dtype=torch.float16)
+            torch_dtype=torch.bfloat16,
+        ).to(self.device, dtype=torch.bfloat16)
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
         self.clip_image_processor = CLIPImageProcessor()
         # image proj model

@@ -296,7 +296,7 @@ class ConceptrolIPAdapter:
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.projection_dim,
             clip_extra_context_tokens=self.num_tokens,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     def set_ip_adapter(self, global_masking, adaptive_scale_mask):

@@ -328,7 +328,7 @@ class ConceptrolIPAdapter:
                     global_masking=global_masking,
                     adaptive_scale_mask=adaptive_scale_mask,
                     concept_mask_layer=SD_CONCEPT_LAYER,
-                ).to(self.device, dtype=torch.float16)
+                ).to(self.device, dtype=torch.bfloat16)
         unet.set_attn_processor(attn_procs)
         for name in unet.attn_processors.keys():  # noqa: SIM118
             cross_attention_dim = (

@@ -395,10 +395,10 @@ class ConceptrolIPAdapter:
                 images=pil_image, return_tensors="pt"
             ).pixel_values
             clip_image_embeds = self.image_encoder(
-                clip_image.to(self.device, dtype=torch.float16)
+                clip_image.to(self.device, dtype=torch.bfloat16)
             ).image_embeds
         else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.bfloat16)
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_image_prompt_embeds = self.image_proj_model(
             torch.zeros_like(clip_image_embeds)

@@ -624,7 +624,7 @@ class ConceptrolIPAdapterXL(ConceptrolIPAdapter):
                     global_masking=global_masking,
                     adaptive_scale_mask=adaptive_scale_mask,
                     concept_mask_layer=SDXL_CONCEPT_LAYER,
-                ).to(self.device, dtype=torch.float16)
+                ).to(self.device, dtype=torch.bfloat16)
         unet.set_attn_processor(attn_procs)
         for name in unet.attn_processors.keys():  # noqa: SIM118
             cross_attention_dim = (

@@ -743,7 +743,7 @@ class IPAdapterPlus(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -753,7 +753,7 @@ class IPAdapterPlus(IPAdapter):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]

@@ -778,7 +778,7 @@ class ConceptrolIPAdapterPlus(ConceptrolIPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -788,7 +788,7 @@ class ConceptrolIPAdapterPlus(ConceptrolIPAdapter):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]

@@ -807,7 +807,7 @@ class IPAdapterFull(IPAdapterPlus):
         image_proj_model = MLPProjModel(
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.hidden_size,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model


@@ -824,7 +824,7 @@ class IPAdapterPlusXL(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -834,7 +834,7 @@ class IPAdapterPlusXL(IPAdapter):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]

@@ -937,7 +937,7 @@ class ConceptrolIPAdapterPlusXL(ConceptrolIPAdapterXL):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -947,7 +947,7 @@ class ConceptrolIPAdapterPlusXL(ConceptrolIPAdapterXL):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]
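In ip_adapter.py every cast follows the same rule: the CLIP image encoder, the projection model, and the attention processors are created or moved in bfloat16, and the preprocessed pixel values are cast to the same dtype before encoding. A minimal sketch of the encoder-side pattern (standalone helper for illustration; the Conceptrol classes themselves are not reproduced):

import torch
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

device = "cuda" if torch.cuda.is_available() else "cpu"

# IP-Adapter's CLIP vision tower, loaded and kept in bfloat16.
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.bfloat16
).to(device)
clip_image_processor = CLIPImageProcessor()

@torch.inference_mode()
def embed_image(pil_image):
    # Preprocess, match the encoder's dtype, and return CLIP image embeddings.
    clip_image = clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
    clip_image = clip_image.to(device, dtype=torch.bfloat16)
    return image_encoder(clip_image).image_embeds

Keeping the encoder, projection model, and inputs in one dtype avoids mixed-precision matmul errors once the UNet itself runs in bfloat16.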