Update pipline_StableDiffusionXL_ConsistentID.py
pipline_StableDiffusionXL_ConsistentID.py CHANGED

@@ -42,17 +42,37 @@ PipelineImageInput = Union[
 
 
 class ConsistentIDStableDiffusionXLPipeline(StableDiffusionXLPipeline):
+
+    def cuda(self, dtype=torch.float16, use_xformers=False):
+        self.to('cuda', dtype)
+
+        # if hasattr(self, 'image_proj_model'):
+        #     self.image_proj_model.to(self.unet.device).to(self.unet.dtype)
+
+        if use_xformers:
+            if is_xformers_available():
+                import xformers
+                from packaging import version
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    logger.warn(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                self.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError("xformers is not available. Make sure it is installed correctly")
 
     @validate_hf_hub_args
     def load_ConsistentID_model(
         self,
         pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        bise_net,
         weight_name: str,
         subfolder: str = '',
         trigger_word_ID: str = '<|image|>',
         trigger_word_facial: str = '<|facial|>',
         image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K', # Import CLIP pretrained model
-        bise_net_cp: str = 'JackAILab/ConsistentID/face_parsing.pth',
         torch_dtype = torch.float16,
         num_tokens = 4,
         lora_rank= 128,
@@ -75,10 +95,11 @@ class ConsistentIDStableDiffusionXLPipeline(StableDiffusionXLPipeline):
         self.app.prepare(ctx_id=0, det_size=(512, 512)) ### (640, 640)
 
         ### BiSeNet
-        self.bise_net = BiSeNet(n_classes = 19)
-        self.bise_net.cuda()
-        self.bise_net_cp= bise_net_cp # Import BiSeNet model
-        self.bise_net.load_state_dict(torch.load(self.bise_net_cp)) # , map_location="cpu"
+        # self.bise_net = BiSeNet(n_classes = 19)
+        # self.bise_net.cuda()
+        # self.bise_net_cp= bise_net_cp # Import BiSeNet model
+        # self.bise_net.load_state_dict(torch.load(self.bise_net_cp)) # , map_location="cpu"
+        self.bise_net = bise_net # load from outside
         self.bise_net.eval()
         # Colors for all 20 parts
         self.part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
@@ -92,7 +113,7 @@ class ConsistentIDStableDiffusionXLPipeline(StableDiffusionXLPipeline):
             [0, 255, 255], [85, 255, 255], [170, 255, 255]]
 
         ### LLVA Optional
-        self.llva_model_path = "" #
+        self.llva_model_path = "liuhaotian/llava-v1.5-13b" # import llava weights
         self.llva_prompt = "Describe this person's facial features for me, including face, ears, eyes, nose, and mouth."
         self.llva_tokenizer, self.llva_model, self.llva_image_processor, self.llva_context_len = None,None,None,None #load_pretrained_model(self.llva_model_path)
 
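For context, a minimal usage sketch of the API after this commit: the BiSeNet face-parsing model is now built and loaded by the caller and injected through the new bise_net parameter, instead of being loaded from the removed bise_net_cp path inside the pipeline, and the new cuda() helper moves the pipeline to the GPU and optionally enables xFormers attention. The BiSeNet import path, base-model ID, checkpoint path, and weight_name below are illustrative assumptions, not values taken from this commit.

import torch
from pipline_StableDiffusionXL_ConsistentID import ConsistentIDStableDiffusionXLPipeline
# Assumption: the BiSeNet import path is repo-specific; adjust to where it lives.
from models.BiSeNet.model import BiSeNet

# Build and load the face-parsing model outside the pipeline, as the new
# signature expects, then inject it via the bise_net argument.
bise_net = BiSeNet(n_classes=19)
bise_net.load_state_dict(torch.load("face_parsing.pth"))  # placeholder checkpoint path
bise_net.cuda()

pipe = ConsistentIDStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # illustrative SDXL base model
    torch_dtype=torch.float16,
)
pipe.cuda(dtype=torch.float16, use_xformers=True)  # helper added in this commit

pipe.load_ConsistentID_model(
    "JackAILab/ConsistentID",           # repo name from the original default path
    bise_net,                           # now passed in from outside
    weight_name="ConsistentID-v1.bin",  # placeholder weight file name
)

Injecting bise_net this way keeps heavy checkpoint loading out of the pipeline and lets a Space share one parsing model across pipelines, which matches the "load from outside" comment in the diff.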