Update pipline_StableDiffusionXL_ConsistentID.py
pipline_StableDiffusionXL_ConsistentID.py CHANGED

@@ -42,17 +42,37 @@ PipelineImageInput = Union[
 
 
 class ConsistentIDStableDiffusionXLPipeline(StableDiffusionXLPipeline):
+
+    def cuda(self, dtype=torch.float16, use_xformers=False):
+        self.to('cuda', dtype)
+
+        # if hasattr(self, 'image_proj_model'):
+        #     self.image_proj_model.to(self.unet.device).to(self.unet.dtype)
+
+        if use_xformers:
+            if is_xformers_available():
+                import xformers
+                from packaging import version
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    logger.warn(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                self.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError("xformers is not available. Make sure it is installed correctly")
 
     @validate_hf_hub_args
     def load_ConsistentID_model(
         self,
         pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
+        bise_net,
         weight_name: str,
         subfolder: str = '',
         trigger_word_ID: str = '<|image|>',
         trigger_word_facial: str = '<|facial|>',
         image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K', # Import CLIP pretrained model
-        bise_net_cp: str = 'JackAILab/ConsistentID/face_parsing.pth',
         torch_dtype = torch.float16,
         num_tokens = 4,
         lora_rank= 128,
@@ -75,10 +95,11 @@ class ConsistentIDStableDiffusionXLPipeline(StableDiffusionXLPipeline):
         self.app.prepare(ctx_id=0, det_size=(512, 512)) ### (640, 640)
 
         ### BiSeNet
-        self.bise_net = BiSeNet(n_classes = 19)
-        self.bise_net.cuda()
-        self.bise_net_cp= bise_net_cp # Import BiSeNet model
-        self.bise_net.load_state_dict(torch.load(self.bise_net_cp)) # , map_location="cpu"
+        # self.bise_net = BiSeNet(n_classes = 19)
+        # self.bise_net.cuda()
+        # self.bise_net_cp= bise_net_cp # Import BiSeNet model
+        # self.bise_net.load_state_dict(torch.load(self.bise_net_cp)) # , map_location="cpu"
+        self.bise_net = bise_net # load from outside
         self.bise_net.eval()
         # Colors for all 20 parts
         self.part_colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0],
@@ -92,7 +113,7 @@ class ConsistentIDStableDiffusionXLPipeline(StableDiffusionXLPipeline):
             [0, 255, 255], [85, 255, 255], [170, 255, 255]]
 
         ### LLVA Optional
-        self.llva_model_path = "" #
+        self.llva_model_path = "liuhaotian/llava-v1.5-13b" # import llava weights
         self.llva_prompt = "Describe this person's facial features for me, including face, ears, eyes, nose, and mouth."
         self.llva_tokenizer, self.llva_model, self.llva_image_processor, self.llva_context_len = None,None,None,None #load_pretrained_model(self.llva_model_path)
 
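For context, a minimal usage sketch of the API after this commit: the BiSeNet face-parsing model is now built and loaded by the caller and injected through the new bise_net parameter, instead of being loaded from the removed bise_net_cp path inside the pipeline, and the new cuda() helper moves the pipeline to the GPU and optionally enables xFormers attention. The BiSeNet import path, base-model ID, checkpoint path, and weight_name below are illustrative assumptions, not values taken from this commit.

import torch
from pipline_StableDiffusionXL_ConsistentID import ConsistentIDStableDiffusionXLPipeline
# Assumption: the BiSeNet import path is repo-specific; adjust to where it lives.
from models.BiSeNet.model import BiSeNet

# Build and load the face-parsing model outside the pipeline, as the new
# signature expects, then inject it via the bise_net argument.
bise_net = BiSeNet(n_classes=19)
bise_net.load_state_dict(torch.load("face_parsing.pth"))  # placeholder checkpoint path
bise_net.cuda()

pipe = ConsistentIDStableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",  # illustrative SDXL base model
    torch_dtype=torch.float16,
)
pipe.cuda(dtype=torch.float16, use_xformers=True)  # helper added in this commit

pipe.load_ConsistentID_model(
    "JackAILab/ConsistentID",           # repo name from the original default path
    bise_net,                           # now passed in from outside
    weight_name="ConsistentID-v1.bin",  # placeholder weight file name
)

Injecting bise_net this way keeps heavy checkpoint loading out of the pipeline and lets a Space share one parsing model across pipelines, which matches the "load from outside" comment in the diff.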