Spaces: Running on Zero

update to bf16

Browse files
- app.py +4 -4
- ip_adapter/custom_pipelines.py +2 -2
- ip_adapter/ip_adapter.py +22 -22
app.py CHANGED

@@ -61,7 +61,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 adapter_name = "h94/IP-Adapter/models/ip-adapter-plus_sd15.bin"
 pipe = StableDiffusionCustomPipeline.from_pretrained(
     "SG161222/Realistic_Vision_V5.1_noVAE",
-    torch_dtype=torch.float16,
+    torch_dtype=torch.bfloat16,
     feature_extractor=None,
     safety_checker=None
 )

@@ -91,7 +91,7 @@ def change_model_fn(model_name: str) -> None:
     pipe = StableDiffusionXLCustomPipeline.from_pretrained(
         name_mapping[model_name],
         # variant="fp16",
-        torch_dtype=torch.float16,
+        torch_dtype=torch.bfloat16,
         feature_extractor=None
     )
     pipeline = ConceptrolIPAdapterPlusXL(pipe, "", adapter_name, device, num_tokens=16)

@@ -117,7 +117,7 @@ def change_model_fn(model_name: str) -> None:
     adapter_name = "h94/IP-Adapter/models/ip-adapter-plus_sd15.bin"
     pipe = StableDiffusionCustomPipeline.from_pretrained(
         name_mapping[model_name],
-        torch_dtype=torch.float16,
+        torch_dtype=torch.bfloat16,
         feature_extractor=None,
         safety_checker=None
     )

@@ -389,4 +389,4 @@ with gr.Blocks(css="style.css") as demo:
     )
     gr.Markdown(article)

-demo.launch(
+demo.launch()
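The app.py edits are the same one-line dtype switch on each from_pretrained call. A minimal, self-contained sketch of that pattern using the stock diffusers StableDiffusionPipeline (the repo's StableDiffusionCustomPipeline is assumed to accept the same keyword arguments):

import torch
from diffusers import StableDiffusionPipeline

# Load the SD 1.5 base model directly in bfloat16 instead of float16,
# mirroring the torch_dtype change made throughout app.py.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = StableDiffusionPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V5.1_noVAE",
    torch_dtype=torch.bfloat16,
    feature_extractor=None,
    safety_checker=None,
).to(device)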
ip_adapter/custom_pipelines.py CHANGED

@@ -408,7 +408,7 @@ class StableDiffusionXLCustomPipeline(StableDiffusionXLPipeline):
         if not output_type == "latent":
             # make sure the VAE is in float32 mode, as it overflows in float16
             needs_upcasting = (
-                self.vae.dtype == torch.float16 and self.vae.config.force_upcast
+                self.vae.dtype == torch.bfloat16 and self.vae.config.force_upcast
             )

             if needs_upcasting:

@@ -423,7 +423,7 @@ class StableDiffusionXLCustomPipeline(StableDiffusionXLPipeline):

             # cast back to fp16 if needed
             if needs_upcasting:
-                self.vae.to(dtype=torch.float16)
+                self.vae.to(dtype=torch.bfloat16)
         else:
             image = latents

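The custom_pipelines.py hunks keep the SDXL decode-time guard but make it track the new working dtype. A minimal sketch of that guard, assuming a diffusers AutoencoderKL-style vae with a force_upcast config flag:

import torch

def decode_with_upcast(vae, latents):
    # The VAE is temporarily run in float32 when its config requests upcasting;
    # with this commit the check (and the cast back afterwards) target bfloat16
    # rather than float16.
    needs_upcasting = vae.dtype == torch.bfloat16 and vae.config.force_upcast
    if needs_upcasting:
        vae.to(dtype=torch.float32)
        latents = latents.to(torch.float32)
    image = vae.decode(latents / vae.config.scaling_factor).sample
    if needs_upcasting:
        vae.to(dtype=torch.bfloat16)
    return image

Since bfloat16 shares float32's exponent range, the overflow the original comment warns about is mainly a float16 concern; the guard is kept here with only the dtype swapped.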
ip_adapter/ip_adapter.py CHANGED

@@ -85,8 +85,8 @@ class IPAdapter:
         self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             "h94/IP-Adapter",
             subfolder="models/image_encoder",
-            torch_dtype=torch.float16,
-        ).to(self.device, dtype=torch.float16)
+            torch_dtype=torch.bfloat16,
+        ).to(self.device, dtype=torch.bfloat16)
         self.clip_image_processor = CLIPImageProcessor()
         # image proj model
         self.image_proj_model = self.init_proj()

@@ -98,7 +98,7 @@ class IPAdapter:
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.projection_dim,
             clip_extra_context_tokens=self.num_tokens,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     def set_ip_adapter(self):

@@ -126,7 +126,7 @@ class IPAdapter:
                     cross_attention_dim=cross_attention_dim,
                     scale=1.0,
                     num_tokens=self.num_tokens,
-                ).to(self.device, dtype=torch.float16)
+                ).to(self.device, dtype=torch.bfloat16)
         unet.set_attn_processor(attn_procs)
         if hasattr(self.pipe, "controlnet"):
             if isinstance(self.pipe.controlnet, MultiControlNetModel):

@@ -167,10 +167,10 @@ class IPAdapter:
                 images=pil_image, return_tensors="pt"
             ).pixel_values
             clip_image_embeds = self.image_encoder(
-                clip_image.to(self.device, dtype=torch.float16)
+                clip_image.to(self.device, dtype=torch.bfloat16)
             ).image_embeds
         else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.bfloat16)
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_image_prompt_embeds = self.image_proj_model(
             torch.zeros_like(clip_image_embeds)

@@ -282,8 +282,8 @@ class ConceptrolIPAdapter:
         self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
             "h94/IP-Adapter",
             subfolder="models/image_encoder",
-            torch_dtype=torch.float16,
-        ).to(self.device, dtype=torch.float16)
+            torch_dtype=torch.bfloat16,
+        ).to(self.device, dtype=torch.bfloat16)
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
         self.clip_image_processor = CLIPImageProcessor()
         # image proj model

@@ -296,7 +296,7 @@ class ConceptrolIPAdapter:
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.projection_dim,
             clip_extra_context_tokens=self.num_tokens,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     def set_ip_adapter(self, global_masking, adaptive_scale_mask):

@@ -328,7 +328,7 @@ class ConceptrolIPAdapter:
                     global_masking=global_masking,
                     adaptive_scale_mask=adaptive_scale_mask,
                     concept_mask_layer=SD_CONCEPT_LAYER,
-                ).to(self.device, dtype=torch.float16)
+                ).to(self.device, dtype=torch.bfloat16)
         unet.set_attn_processor(attn_procs)
         for name in unet.attn_processors.keys():  # noqa: SIM118
             cross_attention_dim = (

@@ -395,10 +395,10 @@ class ConceptrolIPAdapter:
                 images=pil_image, return_tensors="pt"
             ).pixel_values
             clip_image_embeds = self.image_encoder(
-                clip_image.to(self.device, dtype=torch.float16)
+                clip_image.to(self.device, dtype=torch.bfloat16)
             ).image_embeds
         else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
+            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.bfloat16)
         image_prompt_embeds = self.image_proj_model(clip_image_embeds)
         uncond_image_prompt_embeds = self.image_proj_model(
             torch.zeros_like(clip_image_embeds)

@@ -624,7 +624,7 @@ class ConceptrolIPAdapterXL(ConceptrolIPAdapter):
                     global_masking=global_masking,
                     adaptive_scale_mask=adaptive_scale_mask,
                     concept_mask_layer=SDXL_CONCEPT_LAYER,
-                ).to(self.device, dtype=torch.float16)
+                ).to(self.device, dtype=torch.bfloat16)
         unet.set_attn_processor(attn_procs)
         for name in unet.attn_processors.keys():  # noqa: SIM118
             cross_attention_dim = (

@@ -743,7 +743,7 @@ class IPAdapterPlus(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -753,7 +753,7 @@ class IPAdapterPlus(IPAdapter):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]

@@ -778,7 +778,7 @@ class ConceptrolIPAdapterPlus(ConceptrolIPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -788,7 +788,7 @@ class ConceptrolIPAdapterPlus(ConceptrolIPAdapter):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]

@@ -807,7 +807,7 @@ class IPAdapterFull(IPAdapterPlus):
         image_proj_model = MLPProjModel(
             cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
             clip_embeddings_dim=self.image_encoder.config.hidden_size,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model


@@ -824,7 +824,7 @@ class IPAdapterPlusXL(IPAdapter):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -834,7 +834,7 @@ class IPAdapterPlusXL(IPAdapter):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]

@@ -937,7 +937,7 @@ class ConceptrolIPAdapterPlusXL(ConceptrolIPAdapterXL):
             embedding_dim=self.image_encoder.config.hidden_size,
             output_dim=self.pipe.unet.config.cross_attention_dim,
             ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
+        ).to(self.device, dtype=torch.bfloat16)
         return image_proj_model

     @torch.inference_mode()

@@ -947,7 +947,7 @@ class ConceptrolIPAdapterPlusXL(ConceptrolIPAdapterXL):
         clip_image = self.clip_image_processor(
             images=pil_image, return_tensors="pt"
         ).pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
+        clip_image = clip_image.to(self.device, dtype=torch.bfloat16)
         clip_image_embeds = self.image_encoder(
             clip_image, output_hidden_states=True
         ).hidden_states[-2]
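In ip_adapter.py every cast follows the same rule: the CLIP image encoder, the projection model, and the attention processors are created or moved in bfloat16, and the preprocessed pixel values are cast to the same dtype before encoding. A minimal sketch of the encoder-side pattern (standalone helper for illustration; the Conceptrol classes themselves are not reproduced):

import torch
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

device = "cuda" if torch.cuda.is_available() else "cpu"

# IP-Adapter's CLIP vision tower, loaded and kept in bfloat16.
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.bfloat16
).to(device)
clip_image_processor = CLIPImageProcessor()

@torch.inference_mode()
def embed_image(pil_image):
    # Preprocess, match the encoder's dtype, and return CLIP image embeddings.
    clip_image = clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
    clip_image = clip_image.to(device, dtype=torch.bfloat16)
    return image_encoder(clip_image).image_embeds

Keeping the encoder, projection model, and inputs in one dtype avoids mixed-precision matmul errors once the UNet itself runs in bfloat16.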