JasonSmithSO committed
Commit d051564 · verified
1 Parent(s): 8866644

Upload 304 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +8 -0
  2. PuLID_ComfyUI/LICENSE +201 -0
  3. PuLID_ComfyUI/README.md +37 -0
  4. PuLID_ComfyUI/__init__.py +3 -0
  5. PuLID_ComfyUI/encoders.py +63 -0
  6. PuLID_ComfyUI/eva_clip/__init__.py +11 -0
  7. PuLID_ComfyUI/eva_clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  8. PuLID_ComfyUI/eva_clip/constants.py +2 -0
  9. PuLID_ComfyUI/eva_clip/eva_vit_model.py +548 -0
  10. PuLID_ComfyUI/eva_clip/factory.py +517 -0
  11. PuLID_ComfyUI/eva_clip/hf_configs.py +57 -0
  12. PuLID_ComfyUI/eva_clip/hf_model.py +248 -0
  13. PuLID_ComfyUI/eva_clip/loss.py +138 -0
  14. PuLID_ComfyUI/eva_clip/model.py +439 -0
  15. PuLID_ComfyUI/eva_clip/model_configs/EVA01-CLIP-B-16.json +19 -0
  16. PuLID_ComfyUI/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json +24 -0
  17. PuLID_ComfyUI/eva_clip/model_configs/EVA01-CLIP-g-14.json +24 -0
  18. PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-B-16.json +29 -0
  19. PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-L-14-336.json +29 -0
  20. PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-L-14.json +29 -0
  21. PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json +25 -0
  22. PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-bigE-14.json +25 -0
  23. PuLID_ComfyUI/eva_clip/modified_resnet.py +181 -0
  24. PuLID_ComfyUI/eva_clip/openai.py +144 -0
  25. PuLID_ComfyUI/eva_clip/pretrained.py +332 -0
  26. PuLID_ComfyUI/eva_clip/rope.py +137 -0
  27. PuLID_ComfyUI/eva_clip/timm_model.py +122 -0
  28. PuLID_ComfyUI/eva_clip/tokenizer.py +201 -0
  29. PuLID_ComfyUI/eva_clip/transform.py +103 -0
  30. PuLID_ComfyUI/eva_clip/transformer.py +737 -0
  31. PuLID_ComfyUI/eva_clip/utils.py +326 -0
  32. PuLID_ComfyUI/examples/PuLID_4-Step_lightning.json +631 -0
  33. PuLID_ComfyUI/examples/PuLID_IPAdapter_style_transfer.json +794 -0
  34. PuLID_ComfyUI/examples/PuLID_attention_mask.json +946 -0
  35. PuLID_ComfyUI/examples/PuLID_lightning_lora.json +649 -0
  36. PuLID_ComfyUI/examples/PuLID_simple.json +601 -0
  37. PuLID_ComfyUI/examples/pulid_wf.jpg +3 -0
  38. PuLID_ComfyUI/pulid.py +492 -0
  39. PuLID_ComfyUI/pyproject.toml +15 -0
  40. PuLID_ComfyUI/requirements.txt +6 -0
  41. example_node.py.example +155 -0
  42. rgthree-comfy/LICENSE +21 -0
  43. rgthree-comfy/README.md +411 -0
  44. rgthree-comfy/__build__.py +134 -0
  45. rgthree-comfy/__init__.py +321 -0
  46. rgthree-comfy/__update_comfy__.py +57 -0
  47. rgthree-comfy/docs/rgthree_advanced.png +3 -0
  48. rgthree-comfy/docs/rgthree_advanced_metadata.png +3 -0
  49. rgthree-comfy/docs/rgthree_context.png +3 -0
  50. rgthree-comfy/docs/rgthree_context_metadata.png +3 -0
.gitattributes CHANGED
@@ -45,3 +45,11 @@ ComfyUI-KJNodes/fonts/FreeMonoBoldOblique.otf filter=lfs diff=lfs merge=lfs -text
  ComfyUI-KJNodes/fonts/TTNorms-Black.otf filter=lfs diff=lfs merge=lfs -text
  ComfyUI-Kolors-MZ/configs/tokenizer/vocab.txt filter=lfs diff=lfs merge=lfs -text
  ComfyUI-KwaiKolorsWrapper/configs/tokenizer/vocab.txt filter=lfs diff=lfs merge=lfs -text
+ PuLID_ComfyUI/examples/pulid_wf.jpg filter=lfs diff=lfs merge=lfs -text
+ rgthree-comfy/docs/rgthree_advanced_metadata.png filter=lfs diff=lfs merge=lfs -text
+ rgthree-comfy/docs/rgthree_advanced.png filter=lfs diff=lfs merge=lfs -text
+ rgthree-comfy/docs/rgthree_context_metadata.png filter=lfs diff=lfs merge=lfs -text
+ rgthree-comfy/docs/rgthree_context.png filter=lfs diff=lfs merge=lfs -text
+ x-flux-comfyui/assets/image1.png filter=lfs diff=lfs merge=lfs -text
+ x-flux-comfyui/guide/manager_menu.png filter=lfs diff=lfs merge=lfs -text
+ x-flux-comfyui/workflows/example.jpg filter=lfs diff=lfs merge=lfs -text
PuLID_ComfyUI/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
PuLID_ComfyUI/README.md ADDED
@@ -0,0 +1,37 @@
+ # PuLID ComfyUI
+ 
+ [PuLID](https://github.com/ToTheBeginning/PuLID) ComfyUI native implementation.
+ 
+ ![basic workflow](examples/pulid_wf.jpg)
+ 
+ ## Important updates
+ 
+ - **2024.05.12:** Added attention masking and the Advanced node, which allows fine-tuning of the generation.
+ 
+ ## Notes
+ 
+ The code can be considered beta and things may change in the coming days. In the `examples` directory you'll find some basic workflows.
+ 
+ The original implementation makes use of a [4-step Lightning UNet](https://huggingface.co/ByteDance/SDXL-Lightning). I made a few comparisons with the official Gradio demo using the same model in ComfyUI and I can't see any noticeable difference, meaning that this code should be faithful to the original. The Lightning LoRA doesn't work as well.
+ 
+ Testing other models, though, I noticed some quality degradation. You may need to experiment with CFG and various samplers/schedulers (try `sgm_uniform`).
+ 
+ **The quality of the reference image is very important.** Maybe this is because the EVA CLIP picks up more details. Be sure to use a clean and sharp picture!
+ 
+ **For IPAdapter compatibility you need to update the IPAdapter extension!**
+ 
+ ## The 'method' parameter
+ 
+ `method` applies the weights in different ways. `fidelity` is closer to the reference ID, while `style` leaves more freedom to the checkpoint. Sometimes the difference is minimal. I've also added `neutral`, which doesn't do any normalization; if you use this option with the standard Apply node, be sure to lower the weight. With the Advanced node you can simply increase the `fidelity` value.
+ 
+ The Advanced node has a `fidelity` slider and a `projection` option. `ortho_v2` with `fidelity: 8` is the same as the `fidelity` method in the standard node. Projection `ortho` with `fidelity: 16` is the same as method `style`.
+ 
+ **Lower `fidelity` values grant higher resemblance to the reference image.**
+ 
+ ## Installation
+ 
+ - The [PuLID pre-trained model](https://huggingface.co/huchenlei/ipadapter_pulid/resolve/main/ip-adapter_pulid_sdxl_fp16.safetensors?download=true) goes in `ComfyUI/models/pulid/` (thanks to [Chenlei Hu](https://github.com/huchenlei) for converting it into IPAdapter format).
+ - The EVA CLIP model is EVA02-CLIP-L-14-336; it should be downloaded automatically (it will be located in the Hugging Face cache directory).
+ - The `facexlib` dependency needs to be installed; its models are downloaded on first use.
+ - Finally, you need InsightFace with [AntelopeV2](https://huggingface.co/MonsterMMORPG/tools/tree/main); place the unzipped models in `ComfyUI/models/insightface/models/antelopev2` (a quick path check is sketched below).
+ 
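As a quick aid for the installation list above, here is a hypothetical path check. It is not part of this upload; it only re-uses the locations named in the README and assumes it is run from the ComfyUI root.

```python
# Hypothetical helper: verify the model locations listed in the README above.
# Assumes the current working directory is the ComfyUI root.
from pathlib import Path

expected = [
    Path("models/pulid/ip-adapter_pulid_sdxl_fp16.safetensors"),
    Path("models/insightface/models/antelopev2"),  # unzipped AntelopeV2 folder
]

for p in expected:
    status = "ok" if p.exists() else "MISSING"
    print(f"{status:8s} {p}")
```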
PuLID_ComfyUI/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .pulid import NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+ 
+ __all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
PuLID_ComfyUI/encoders.py ADDED
@@ -0,0 +1,63 @@
+ import torch
+ import torch.nn as nn
+ 
+ class IDEncoder(nn.Module):
+     def __init__(self, width=1280, context_dim=2048, num_token=5):
+         super().__init__()
+         self.num_token = num_token
+         self.context_dim = context_dim
+         h1 = min((context_dim * num_token) // 4, 1024)
+         h2 = min((context_dim * num_token) // 2, 1024)
+         self.body = nn.Sequential(
+             nn.Linear(width, h1),
+             nn.LayerNorm(h1),
+             nn.LeakyReLU(),
+             nn.Linear(h1, h2),
+             nn.LayerNorm(h2),
+             nn.LeakyReLU(),
+             nn.Linear(h2, context_dim * num_token),
+         )
+ 
+         for i in range(5):
+             setattr(
+                 self,
+                 f'mapping_{i}',
+                 nn.Sequential(
+                     nn.Linear(1024, 1024),
+                     nn.LayerNorm(1024),
+                     nn.LeakyReLU(),
+                     nn.Linear(1024, 1024),
+                     nn.LayerNorm(1024),
+                     nn.LeakyReLU(),
+                     nn.Linear(1024, context_dim),
+                 ),
+             )
+ 
+             setattr(
+                 self,
+                 f'mapping_patch_{i}',
+                 nn.Sequential(
+                     nn.Linear(1024, 1024),
+                     nn.LayerNorm(1024),
+                     nn.LeakyReLU(),
+                     nn.Linear(1024, 1024),
+                     nn.LayerNorm(1024),
+                     nn.LeakyReLU(),
+                     nn.Linear(1024, context_dim),
+                 ),
+             )
+ 
+     def forward(self, x, y):
+         # x shape [N, C]
+         x = self.body(x)
+         x = x.reshape(-1, self.num_token, self.context_dim)
+ 
+         hidden_states = ()
+         for i, emb in enumerate(y):
+             hidden_state = getattr(self, f'mapping_{i}')(emb[:, :1]) + getattr(self, f'mapping_patch_{i}')(
+                 emb[:, 1:]
+             ).mean(dim=1, keepdim=True)
+             hidden_states += (hidden_state,)
+         hidden_states = torch.cat(hidden_states, dim=1)
+ 
+         return torch.cat([x, hidden_states], dim=1)
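For orientation, here is a hedged smoke test of the tensor shapes `IDEncoder` works with (not part of the upload): `x` is the global ID feature and `y` is a list of five vision-transformer hidden states whose first token is the CLS token; the 577-token length is only an illustrative choice, and the import path assumes the script is run next to `encoders.py`.

```python
import torch
from encoders import IDEncoder  # e.g. when run from inside the PuLID_ComfyUI directory

enc = IDEncoder(width=1280, context_dim=2048, num_token=5)

x = torch.randn(2, 1280)                           # per-image global ID feature
y = [torch.randn(2, 577, 1024) for _ in range(5)]  # five hidden states: CLS token + patch tokens

out = enc(x, y)
print(out.shape)  # torch.Size([2, 10, 2048]): 5 projected ID tokens + 5 pooled hidden-state tokens
```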
PuLID_ComfyUI/eva_clip/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_transforms
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
+ from .loss import ClipLoss
+ from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\
+     convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
+ from .openai import load_openai_model, list_openai_models
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\
+     get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
+ from .tokenizer import SimpleTokenizer, tokenize
+ from .transform import image_transform
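A hedged sketch of the factory helpers re-exported here (not part of the upload): it assumes the `eva_clip` package is importable, and the model name follows the variant named in the README.

```python
# Sketch: list the bundled model configs and inspect the variant PuLID uses.
from eva_clip import list_models, get_model_config

print(list_models())                            # names of the JSON files under model_configs/

cfg = get_model_config("EVA02-CLIP-L-14-336")   # returns a deep copy of the parsed JSON
print(cfg["embed_dim"], cfg["vision_cfg"])      # keys every registered config must carry (see factory.py)
```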
PuLID_ComfyUI/eva_clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
+ size 1356917
PuLID_ComfyUI/eva_clip/constants.py ADDED
@@ -0,0 +1,2 @@
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
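These are the standard OpenAI CLIP normalization statistics. Below is a minimal usage sketch, not part of the upload: torchvision is an assumption here, and the 336-pixel size merely matches the EVA02-CLIP-L-14-336 variant mentioned in the README.

```python
from torchvision import transforms
from constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD  # when run next to this file

# CLIP-style preprocessing built on these statistics.
preprocess = transforms.Compose([
    transforms.Resize(336),
    transforms.CenterCrop(336),
    transforms.ToTensor(),
    transforms.Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
])
```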
PuLID_ComfyUI/eva_clip/eva_vit_model.py ADDED
@@ -0,0 +1,548 @@
1
+ # --------------------------------------------------------
2
+ # Adapted from https://github.com/microsoft/unilm/tree/master/beit
3
+ # --------------------------------------------------------
4
+ import math
5
+ import os
6
+ from functools import partial
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ try:
11
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
12
+ except:
13
+ from timm.layers import drop_path, to_2tuple, trunc_normal_
14
+
15
+ from .transformer import PatchDropout
16
+ from .rope import VisionRotaryEmbedding, VisionRotaryEmbeddingFast
17
+
18
+ if os.getenv('ENV_TYPE') == 'deepspeed':
19
+ try:
20
+ from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
21
+ except:
22
+ from torch.utils.checkpoint import checkpoint
23
+ else:
24
+ from torch.utils.checkpoint import checkpoint
25
+
26
+ try:
27
+ import xformers
28
+ import xformers.ops as xops
29
+ XFORMERS_IS_AVAILBLE = True
30
+ except:
31
+ XFORMERS_IS_AVAILBLE = False
32
+
33
+ class DropPath(nn.Module):
34
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
35
+ """
36
+ def __init__(self, drop_prob=None):
37
+ super(DropPath, self).__init__()
38
+ self.drop_prob = drop_prob
39
+
40
+ def forward(self, x):
41
+ return drop_path(x, self.drop_prob, self.training)
42
+
43
+ def extra_repr(self) -> str:
44
+ return 'p={}'.format(self.drop_prob)
45
+
46
+
47
+ class Mlp(nn.Module):
48
+ def __init__(
49
+ self,
50
+ in_features,
51
+ hidden_features=None,
52
+ out_features=None,
53
+ act_layer=nn.GELU,
54
+ norm_layer=nn.LayerNorm,
55
+ drop=0.,
56
+ subln=False,
57
+
58
+ ):
59
+ super().__init__()
60
+ out_features = out_features or in_features
61
+ hidden_features = hidden_features or in_features
62
+ self.fc1 = nn.Linear(in_features, hidden_features)
63
+ self.act = act_layer()
64
+
65
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
66
+
67
+ self.fc2 = nn.Linear(hidden_features, out_features)
68
+ self.drop = nn.Dropout(drop)
69
+
70
+ def forward(self, x):
71
+ x = self.fc1(x)
72
+ x = self.act(x)
73
+ # x = self.drop(x)
74
+ # commented out, as in the original BERT implementation
75
+ x = self.ffn_ln(x)
76
+
77
+ x = self.fc2(x)
78
+ x = self.drop(x)
79
+ return x
80
+
81
+ class SwiGLU(nn.Module):
82
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
83
+ norm_layer=nn.LayerNorm, subln=False):
84
+ super().__init__()
85
+ out_features = out_features or in_features
86
+ hidden_features = hidden_features or in_features
87
+
88
+ self.w1 = nn.Linear(in_features, hidden_features)
89
+ self.w2 = nn.Linear(in_features, hidden_features)
90
+
91
+ self.act = act_layer()
92
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
93
+ self.w3 = nn.Linear(hidden_features, out_features)
94
+
95
+ self.drop = nn.Dropout(drop)
96
+
97
+ def forward(self, x):
98
+ x1 = self.w1(x)
99
+ x2 = self.w2(x)
100
+ hidden = self.act(x1) * x2
101
+ x = self.ffn_ln(hidden)
102
+ x = self.w3(x)
103
+ x = self.drop(x)
104
+ return x
105
+
106
+ class Attention(nn.Module):
107
+ def __init__(
108
+ self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
109
+ proj_drop=0., window_size=None, attn_head_dim=None, xattn=False, rope=None, subln=False, norm_layer=nn.LayerNorm):
110
+ super().__init__()
111
+ self.num_heads = num_heads
112
+ head_dim = dim // num_heads
113
+ if attn_head_dim is not None:
114
+ head_dim = attn_head_dim
115
+ all_head_dim = head_dim * self.num_heads
116
+ self.scale = qk_scale or head_dim ** -0.5
117
+
118
+ self.subln = subln
119
+ if self.subln:
120
+ self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
121
+ self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
122
+ self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
123
+ else:
124
+ self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
125
+
126
+ if qkv_bias:
127
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
128
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
129
+ else:
130
+ self.q_bias = None
131
+ self.v_bias = None
132
+
133
+ if window_size:
134
+ self.window_size = window_size
135
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
136
+ self.relative_position_bias_table = nn.Parameter(
137
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
138
+ # cls to token & token 2 cls & cls to cls
139
+
140
+ # get pair-wise relative position index for each token inside the window
141
+ coords_h = torch.arange(window_size[0])
142
+ coords_w = torch.arange(window_size[1])
143
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
144
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
145
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
146
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
147
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
148
+ relative_coords[:, :, 1] += window_size[1] - 1
149
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
150
+ relative_position_index = \
151
+ torch.zeros(size=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
152
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
153
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
154
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
155
+ relative_position_index[0, 0] = self.num_relative_distance - 1
156
+
157
+ self.register_buffer("relative_position_index", relative_position_index)
158
+ else:
159
+ self.window_size = None
160
+ self.relative_position_bias_table = None
161
+ self.relative_position_index = None
162
+
163
+ self.attn_drop = nn.Dropout(attn_drop)
164
+ self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
165
+ # self.proj = nn.Linear(all_head_dim, all_head_dim)
166
+ self.proj = nn.Linear(all_head_dim, dim)
167
+ self.proj_drop = nn.Dropout(proj_drop)
168
+ self.xattn = xattn
169
+ self.xattn_drop = attn_drop
170
+
171
+ self.rope = rope
172
+
173
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
174
+ B, N, C = x.shape
175
+ if self.subln:
176
+ q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
177
+ k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
178
+ v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
179
+
180
+ q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C
181
+ k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
182
+ v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
183
+ else:
184
+
185
+ qkv_bias = None
186
+ if self.q_bias is not None:
187
+ qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
188
+
189
+ qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
190
+ qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) # 3, B, num_heads, N, C
191
+ q, k, v = qkv[0], qkv[1], qkv[2]
192
+
193
+ if self.rope:
194
+ # slightly fast impl
195
+ q_t = q[:, :, 1:, :]
196
+ ro_q_t = self.rope(q_t)
197
+ q = torch.cat((q[:, :, :1, :], ro_q_t), -2).type_as(v)
198
+
199
+ k_t = k[:, :, 1:, :]
200
+ ro_k_t = self.rope(k_t)
201
+ k = torch.cat((k[:, :, :1, :], ro_k_t), -2).type_as(v)
202
+
203
+ if self.xattn:
204
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
205
+ k = k.permute(0, 2, 1, 3)
206
+ v = v.permute(0, 2, 1, 3)
207
+
208
+ x = xops.memory_efficient_attention(
209
+ q, k, v,
210
+ p=self.xattn_drop,
211
+ scale=self.scale,
212
+ )
213
+ x = x.reshape(B, N, -1)
214
+ x = self.inner_attn_ln(x)
215
+ x = self.proj(x)
216
+ x = self.proj_drop(x)
217
+ else:
218
+ q = q * self.scale
219
+ attn = (q @ k.transpose(-2, -1))
220
+
221
+ if self.relative_position_bias_table is not None:
222
+ relative_position_bias = \
223
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
224
+ self.window_size[0] * self.window_size[1] + 1,
225
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
226
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
227
+ attn = attn + relative_position_bias.unsqueeze(0).type_as(attn)
228
+
229
+ if rel_pos_bias is not None:
230
+ attn = attn + rel_pos_bias.type_as(attn)
231
+
232
+ if attn_mask is not None:
233
+ attn_mask = attn_mask.bool()
234
+ attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf"))
235
+
236
+ attn = attn.softmax(dim=-1)
237
+ attn = self.attn_drop(attn)
238
+
239
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
240
+ x = self.inner_attn_ln(x)
241
+ x = self.proj(x)
242
+ x = self.proj_drop(x)
243
+ return x
244
+
245
+
246
+ class Block(nn.Module):
247
+
248
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
249
+ drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
250
+ window_size=None, attn_head_dim=None, xattn=False, rope=None, postnorm=False,
251
+ subln=False, naiveswiglu=False):
252
+ super().__init__()
253
+ self.norm1 = norm_layer(dim)
254
+ self.attn = Attention(
255
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
256
+ attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim,
257
+ xattn=xattn, rope=rope, subln=subln, norm_layer=norm_layer)
258
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
259
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
260
+ self.norm2 = norm_layer(dim)
261
+ mlp_hidden_dim = int(dim * mlp_ratio)
262
+
263
+ if naiveswiglu:
264
+ self.mlp = SwiGLU(
265
+ in_features=dim,
266
+ hidden_features=mlp_hidden_dim,
267
+ subln=subln,
268
+ norm_layer=norm_layer,
269
+ )
270
+ else:
271
+ self.mlp = Mlp(
272
+ in_features=dim,
273
+ hidden_features=mlp_hidden_dim,
274
+ act_layer=act_layer,
275
+ subln=subln,
276
+ drop=drop
277
+ )
278
+
279
+ if init_values is not None and init_values > 0:
280
+ self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
281
+ self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
282
+ else:
283
+ self.gamma_1, self.gamma_2 = None, None
284
+
285
+ self.postnorm = postnorm
286
+
287
+ def forward(self, x, rel_pos_bias=None, attn_mask=None):
288
+ if self.gamma_1 is None:
289
+ if self.postnorm:
290
+ x = x + self.drop_path(self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)))
291
+ x = x + self.drop_path(self.norm2(self.mlp(x)))
292
+ else:
293
+ x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
294
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
295
+ else:
296
+ if self.postnorm:
297
+ x = x + self.drop_path(self.gamma_1 * self.norm1(self.attn(x, rel_pos_bias=rel_pos_bias, attn_mask=attn_mask)))
298
+ x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
299
+ else:
300
+ x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, attn_mask=attn_mask))
301
+ x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
302
+ return x
303
+
304
+
305
+ class PatchEmbed(nn.Module):
306
+ """ Image to Patch Embedding
307
+ """
308
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
309
+ super().__init__()
310
+ img_size = to_2tuple(img_size)
311
+ patch_size = to_2tuple(patch_size)
312
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
313
+ self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
314
+ self.img_size = img_size
315
+ self.patch_size = patch_size
316
+ self.num_patches = num_patches
317
+
318
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
319
+
320
+ def forward(self, x, **kwargs):
321
+ B, C, H, W = x.shape
322
+ # FIXME look at relaxing size constraints
323
+ assert H == self.img_size[0] and W == self.img_size[1], \
324
+ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
325
+ x = self.proj(x).flatten(2).transpose(1, 2)
326
+ return x
327
+
328
+
329
+ class RelativePositionBias(nn.Module):
330
+
331
+ def __init__(self, window_size, num_heads):
332
+ super().__init__()
333
+ self.window_size = window_size
334
+ self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
335
+ self.relative_position_bias_table = nn.Parameter(
336
+ torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH
337
+ # cls to token & token 2 cls & cls to cls
338
+
339
+ # get pair-wise relative position index for each token inside the window
340
+ coords_h = torch.arange(window_size[0])
341
+ coords_w = torch.arange(window_size[1])
342
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
343
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
344
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
345
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
346
+ relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
347
+ relative_coords[:, :, 1] += window_size[1] - 1
348
+ relative_coords[:, :, 0] *= 2 * window_size[1] - 1
349
+ relative_position_index = \
350
+ torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
351
+ relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
352
+ relative_position_index[0, 0:] = self.num_relative_distance - 3
353
+ relative_position_index[0:, 0] = self.num_relative_distance - 2
354
+ relative_position_index[0, 0] = self.num_relative_distance - 1
355
+
356
+ self.register_buffer("relative_position_index", relative_position_index)
357
+
358
+ def forward(self):
359
+ relative_position_bias = \
360
+ self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
361
+ self.window_size[0] * self.window_size[1] + 1,
362
+ self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH
363
+ return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
364
+
365
+
366
+ class EVAVisionTransformer(nn.Module):
367
+ """ Vision Transformer with support for patch or hybrid CNN input stage
368
+ """
369
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
370
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
371
+ drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, patch_dropout=0.,
372
+ use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, rope=False,
373
+ use_mean_pooling=True, init_scale=0.001, grad_checkpointing=False, xattn=False, postnorm=False,
374
+ pt_hw_seq_len=16, intp_freq=False, naiveswiglu=False, subln=False):
375
+ super().__init__()
376
+
377
+ if not XFORMERS_IS_AVAILBLE:
378
+ xattn = False
379
+
380
+ self.image_size = img_size
381
+ self.num_classes = num_classes
382
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
383
+
384
+ self.patch_embed = PatchEmbed(
385
+ img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
386
+ num_patches = self.patch_embed.num_patches
387
+
388
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
389
+ # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
390
+ if use_abs_pos_emb:
391
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
392
+ else:
393
+ self.pos_embed = None
394
+ self.pos_drop = nn.Dropout(p=drop_rate)
395
+
396
+ if use_shared_rel_pos_bias:
397
+ self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
398
+ else:
399
+ self.rel_pos_bias = None
400
+
401
+ if rope:
402
+ half_head_dim = embed_dim // num_heads // 2
403
+ hw_seq_len = img_size // patch_size
404
+ self.rope = VisionRotaryEmbeddingFast(
405
+ dim=half_head_dim,
406
+ pt_seq_len=pt_hw_seq_len,
407
+ ft_seq_len=hw_seq_len if intp_freq else None,
408
+ # patch_dropout=patch_dropout
409
+ )
410
+ else:
411
+ self.rope = None
412
+
413
+ self.naiveswiglu = naiveswiglu
414
+
415
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
416
+ self.use_rel_pos_bias = use_rel_pos_bias
417
+ self.blocks = nn.ModuleList([
418
+ Block(
419
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
420
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
421
+ init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
422
+ xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
423
+ for i in range(depth)])
424
+ self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
425
+ self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
426
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
427
+
428
+ if self.pos_embed is not None:
429
+ trunc_normal_(self.pos_embed, std=.02)
430
+
431
+ trunc_normal_(self.cls_token, std=.02)
432
+ # trunc_normal_(self.mask_token, std=.02)
433
+
434
+ self.apply(self._init_weights)
435
+ self.fix_init_weight()
436
+
437
+ if isinstance(self.head, nn.Linear):
438
+ trunc_normal_(self.head.weight, std=.02)
439
+ self.head.weight.data.mul_(init_scale)
440
+ self.head.bias.data.mul_(init_scale)
441
+
442
+ # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
443
+ self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
444
+
445
+ self.grad_checkpointing = grad_checkpointing
446
+
447
+ def fix_init_weight(self):
448
+ def rescale(param, layer_id):
449
+ param.div_(math.sqrt(2.0 * layer_id))
450
+
451
+ for layer_id, layer in enumerate(self.blocks):
452
+ rescale(layer.attn.proj.weight.data, layer_id + 1)
453
+ if self.naiveswiglu:
454
+ rescale(layer.mlp.w3.weight.data, layer_id + 1)
455
+ else:
456
+ rescale(layer.mlp.fc2.weight.data, layer_id + 1)
457
+
458
+ def get_cast_dtype(self) -> torch.dtype:
459
+ return self.blocks[0].mlp.fc2.weight.dtype
460
+
461
+ def _init_weights(self, m):
462
+ if isinstance(m, nn.Linear):
463
+ trunc_normal_(m.weight, std=.02)
464
+ if m.bias is not None:
465
+ nn.init.constant_(m.bias, 0)
466
+ elif isinstance(m, nn.LayerNorm):
467
+ nn.init.constant_(m.bias, 0)
468
+ nn.init.constant_(m.weight, 1.0)
469
+
470
+ def get_num_layers(self):
471
+ return len(self.blocks)
472
+
473
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
474
+ assert unlocked_groups == 0, 'partial locking not currently supported for this model'
475
+ for param in self.parameters():
476
+ param.requires_grad = False
477
+
478
+ @torch.jit.ignore
479
+ def set_grad_checkpointing(self, enable=True):
480
+ self.grad_checkpointing = enable
481
+
482
+ @torch.jit.ignore
483
+ def no_weight_decay(self):
484
+ return {'pos_embed', 'cls_token'}
485
+
486
+ def get_classifier(self):
487
+ return self.head
488
+
489
+ def reset_classifier(self, num_classes, global_pool=''):
490
+ self.num_classes = num_classes
491
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
492
+
493
+ def forward_features(self, x, return_all_features=False, return_hidden=False, shuffle=False):
494
+
495
+ x = self.patch_embed(x)
496
+ batch_size, seq_len, _ = x.size()
497
+
498
+ if shuffle:
499
+ idx = torch.randperm(x.shape[1]) + 1
500
+ zero = torch.LongTensor([0, ])
501
+ idx = torch.cat([zero, idx])
502
+ pos_embed = self.pos_embed[:, idx]
503
+
504
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
505
+ x = torch.cat((cls_tokens, x), dim=1)
506
+ if shuffle:
507
+ x = x + pos_embed
508
+ elif self.pos_embed is not None:
509
+ x = x + self.pos_embed
510
+ x = self.pos_drop(x)
511
+
512
+ # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
513
+ if os.getenv('RoPE') == '1':
514
+ if self.training and not isinstance(self.patch_dropout, nn.Identity):
515
+ x, patch_indices_keep = self.patch_dropout(x)
516
+ self.rope.forward = partial(self.rope.forward, patch_indices_keep=patch_indices_keep)
517
+ else:
518
+ self.rope.forward = partial(self.rope.forward, patch_indices_keep=None)
519
+ x = self.patch_dropout(x)
520
+ else:
521
+ x = self.patch_dropout(x)
522
+
523
+ rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
524
+ hidden_states = []
525
+ for idx, blk in enumerate(self.blocks):
526
+ if (0 < idx <= 20) and (idx % 4 == 0) and return_hidden:
527
+ hidden_states.append(x)
528
+ if self.grad_checkpointing:
529
+ x = checkpoint(blk, x, (rel_pos_bias,))
530
+ else:
531
+ x = blk(x, rel_pos_bias=rel_pos_bias)
532
+
533
+ if not return_all_features:
534
+ x = self.norm(x)
535
+ if self.fc_norm is not None:
536
+ return self.fc_norm(x.mean(1)), hidden_states
537
+ else:
538
+ return x[:, 0], hidden_states
539
+ return x
540
+
541
+ def forward(self, x, return_all_features=False, return_hidden=False, shuffle=False):
542
+ if return_all_features:
543
+ return self.forward_features(x, return_all_features, return_hidden, shuffle)
544
+ x, hidden_states = self.forward_features(x, return_all_features, return_hidden, shuffle)
545
+ x = self.head(x)
546
+ if return_hidden:
547
+ return x, hidden_states
548
+ return x
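The relative-position bias indexing used in `Attention` and `RelativePositionBias` above is compact and easy to misread. The standalone sketch below (not part of the upload) repeats the same construction for a tiny 2x2 window so the three extra slots for cls-to-token, token-to-cls and cls-to-cls interactions become visible.

```python
import torch

# Same index construction as in eva_vit_model.py, shrunk to a 2x2 window (plus the CLS token).
window_size = (2, 2)
num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3  # 9 spatial offsets + 3 special slots

coords = torch.stack(torch.meshgrid([torch.arange(window_size[0]), torch.arange(window_size[1])]))
coords_flatten = torch.flatten(coords, 1)                                   # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]   # 2, N, N
relative_coords = relative_coords.permute(1, 2, 0).contiguous()             # N, N, 2
relative_coords[:, :, 0] += window_size[0] - 1                              # shift offsets to start at 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1                          # fold (dy, dx) into a single index

index = torch.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
index[1:, 1:] = relative_coords.sum(-1)               # token-to-token offsets
index[0, 0:] = num_relative_distance - 3              # cls -> token
index[0:, 0] = num_relative_distance - 2              # token -> cls
index[0, 0] = num_relative_distance - 1               # cls -> cls
print(index)  # 5x5 table of rows into relative_position_bias_table
```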
PuLID_ComfyUI/eva_clip/factory.py ADDED
@@ -0,0 +1,517 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import re
6
+ from copy import deepcopy
7
+ from pathlib import Path
8
+ from typing import Optional, Tuple, Union, Dict, Any
9
+ import torch
10
+
11
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
12
+ from .model import CLIP, CustomCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
13
+ get_cast_dtype
14
+ from .openai import load_openai_model
15
+ from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model
16
+ from .transform import image_transform
17
+ from .tokenizer import HFTokenizer, tokenize
18
+ from .utils import resize_clip_pos_embed, resize_evaclip_pos_embed, resize_visual_pos_embed, resize_eva_pos_embed
19
+
20
+
21
+ _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
22
+ _MODEL_CONFIGS = {} # dictionary (model_name: config) of model architecture configs
23
+
24
+
25
+ def _natural_key(string_):
26
+ return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
27
+
28
+
29
+ def _rescan_model_configs():
30
+ global _MODEL_CONFIGS
31
+
32
+ config_ext = ('.json',)
33
+ config_files = []
34
+ for config_path in _MODEL_CONFIG_PATHS:
35
+ if config_path.is_file() and config_path.suffix in config_ext:
36
+ config_files.append(config_path)
37
+ elif config_path.is_dir():
38
+ for ext in config_ext:
39
+ config_files.extend(config_path.glob(f'*{ext}'))
40
+
41
+ for cf in config_files:
42
+ with open(cf, "r", encoding="utf8") as f:
43
+ model_cfg = json.load(f)
44
+ if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
45
+ _MODEL_CONFIGS[cf.stem] = model_cfg
46
+
47
+ _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0])))
48
+
49
+
50
+ _rescan_model_configs() # initial populate of model config registry
51
+
52
+
53
+ def list_models():
54
+ """ enumerate available model architectures based on config files """
55
+ return list(_MODEL_CONFIGS.keys())
56
+
57
+
58
+ def add_model_config(path):
59
+ """ add model config path or file and update registry """
60
+ if not isinstance(path, Path):
61
+ path = Path(path)
62
+ _MODEL_CONFIG_PATHS.append(path)
63
+ _rescan_model_configs()
64
+
65
+
66
+ def get_model_config(model_name):
67
+ if model_name in _MODEL_CONFIGS:
68
+ return deepcopy(_MODEL_CONFIGS[model_name])
69
+ else:
70
+ return None
71
+
72
+
73
+ def get_tokenizer(model_name):
74
+ config = get_model_config(model_name)
75
+ tokenizer = HFTokenizer(config['text_cfg']['hf_tokenizer_name']) if 'hf_tokenizer_name' in config['text_cfg'] else tokenize
76
+ return tokenizer
77
+
78
+
79
+ # loading openai CLIP weights when is_openai=True for training
80
+ def load_state_dict(checkpoint_path: str, map_location: str='cpu', model_key: str='model|module|state_dict', is_openai: bool=False, skip_list: list=[]):
81
+ if is_openai:
82
+ model = torch.jit.load(checkpoint_path, map_location="cpu").eval()
83
+ state_dict = model.state_dict()
84
+ for key in ["input_resolution", "context_length", "vocab_size"]:
85
+ state_dict.pop(key, None)
86
+ else:
87
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
88
+ for mk in model_key.split('|'):
89
+ if isinstance(checkpoint, dict) and mk in checkpoint:
90
+ state_dict = checkpoint[mk]
91
+ break
92
+ else:
93
+ state_dict = checkpoint
94
+ if next(iter(state_dict.items()))[0].startswith('module'):
95
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
96
+
97
+ for k in skip_list:
98
+ if k in list(state_dict.keys()):
99
+ logging.info(f"Removing key {k} from pretrained checkpoint")
100
+ del state_dict[k]
101
+
102
+ if os.getenv('RoPE') == '1':
103
+ for k in list(state_dict.keys()):
104
+ if 'freqs_cos' in k or 'freqs_sin' in k:
105
+ del state_dict[k]
106
+ return state_dict
107
+
108
+
109
+
110
+ def load_checkpoint(model, checkpoint_path, model_key="model|module|state_dict", strict=True):
111
+ state_dict = load_state_dict(checkpoint_path, model_key=model_key, is_openai=False)
112
+ # detect old format and make compatible with new format
113
+ if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
114
+ state_dict = convert_to_custom_text_state_dict(state_dict)
115
+ if 'text.logit_scale' in state_dict and hasattr(model, 'logit_scale'):
116
+ state_dict['logit_scale'] = state_dict['text.logit_scale']
117
+ del state_dict['text.logit_scale']
118
+
119
+ # resize_clip_pos_embed for CLIP and open CLIP
120
+ if 'visual.positional_embedding' in state_dict:
121
+ resize_clip_pos_embed(state_dict, model)
122
+ # specified to eva_vit_model
123
+ elif 'visual.pos_embed' in state_dict:
124
+ resize_evaclip_pos_embed(state_dict, model)
125
+
126
+ # resize_clip_pos_embed(state_dict, model)
127
+ incompatible_keys = model.load_state_dict(state_dict, strict=strict)
128
+ logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}")
129
+ return incompatible_keys
130
+
131
+ def load_clip_visual_state_dict(checkpoint_path: str, map_location: str='cpu', is_openai: bool=False, skip_list:list=[]):
132
+ state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list)
133
+
134
+ for k in list(state_dict.keys()):
135
+ if not k.startswith('visual.'):
136
+ del state_dict[k]
137
+ for k in list(state_dict.keys()):
138
+ if k.startswith('visual.'):
139
+ new_k = k[7:]
140
+ state_dict[new_k] = state_dict[k]
141
+ del state_dict[k]
142
+ return state_dict
143
+
144
+ def load_clip_text_state_dict(checkpoint_path: str, map_location: str='cpu', is_openai: bool=False, skip_list:list=[]):
145
+ state_dict = load_state_dict(checkpoint_path, map_location=map_location, is_openai=is_openai, skip_list=skip_list)
146
+
147
+ for k in list(state_dict.keys()):
148
+ if k.startswith('visual.'):
149
+ del state_dict[k]
150
+ return state_dict
151
+
152
+ def get_pretrained_tag(pretrained_model):
153
+ pretrained_model = pretrained_model.lower()
154
+ if "laion" in pretrained_model or "open_clip" in pretrained_model:
155
+ return "open_clip"
156
+ elif "openai" in pretrained_model:
157
+ return "clip"
158
+ elif "eva" in pretrained_model and "clip" in pretrained_model:
159
+ return "eva_clip"
160
+ else:
161
+ return "other"
162
+
163
+ def load_pretrained_checkpoint(
164
+ model,
165
+ visual_checkpoint_path,
166
+ text_checkpoint_path,
167
+ strict=True,
168
+ visual_model=None,
169
+ text_model=None,
170
+ model_key="model|module|state_dict",
171
+ skip_list=[]):
172
+ visual_tag = get_pretrained_tag(visual_model)
173
+ text_tag = get_pretrained_tag(text_model)
174
+
175
+ logging.info(f"num of model state_dict keys: {len(model.state_dict().keys())}")
176
+ visual_incompatible_keys, text_incompatible_keys = None, None
177
+ if visual_checkpoint_path:
178
+ if visual_tag == "eva_clip" or visual_tag == "open_clip":
179
+ visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=False, skip_list=skip_list)
180
+ elif visual_tag == "clip":
181
+ visual_state_dict = load_clip_visual_state_dict(visual_checkpoint_path, is_openai=True, skip_list=skip_list)
182
+ else:
183
+ visual_state_dict = load_state_dict(visual_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list)
184
+
185
+ # resize_clip_pos_embed for CLIP and open CLIP
186
+ if 'positional_embedding' in visual_state_dict:
187
+ resize_visual_pos_embed(visual_state_dict, model)
188
+ # specified to EVA model
189
+ elif 'pos_embed' in visual_state_dict:
190
+ resize_eva_pos_embed(visual_state_dict, model)
191
+
192
+ visual_incompatible_keys = model.visual.load_state_dict(visual_state_dict, strict=strict)
193
+ logging.info(f"num of loaded visual_state_dict keys: {len(visual_state_dict.keys())}")
194
+ logging.info(f"visual_incompatible_keys.missing_keys: {visual_incompatible_keys.missing_keys}")
195
+
196
+ if text_checkpoint_path:
197
+ if text_tag == "eva_clip" or text_tag == "open_clip":
198
+ text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=False, skip_list=skip_list)
199
+ elif text_tag == "clip":
200
+ text_state_dict = load_clip_text_state_dict(text_checkpoint_path, is_openai=True, skip_list=skip_list)
201
+ else:
202
+ text_state_dict = load_state_dict(visual_checkpoint_path, model_key=model_key, is_openai=False, skip_list=skip_list)
203
+
204
+ text_incompatible_keys = model.text.load_state_dict(text_state_dict, strict=strict)
205
+
206
+ logging.info(f"num of loaded text_state_dict keys: {len(text_state_dict.keys())}")
207
+ logging.info(f"text_incompatible_keys.missing_keys: {text_incompatible_keys.missing_keys}")
208
+
209
+ return visual_incompatible_keys, text_incompatible_keys
210
+
211
+ def create_model(
212
+ model_name: str,
213
+ pretrained: Optional[str] = None,
214
+ precision: str = 'fp32',
215
+ device: Union[str, torch.device] = 'cpu',
216
+ jit: bool = False,
217
+ force_quick_gelu: bool = False,
218
+ force_custom_clip: bool = False,
219
+ force_patch_dropout: Optional[float] = None,
220
+ pretrained_image: str = '',
221
+ pretrained_text: str = '',
222
+ pretrained_hf: bool = True,
223
+ pretrained_visual_model: str = None,
224
+ pretrained_text_model: str = None,
225
+ cache_dir: Optional[str] = None,
226
+ skip_list: list = [],
227
+ ):
228
+ model_name = model_name.replace('/', '-') # for callers using old naming with / in ViT names
229
+ if isinstance(device, str):
230
+ device = torch.device(device)
231
+
232
+ if pretrained and pretrained.lower() == 'openai':
233
+ logging.info(f'Loading pretrained {model_name} from OpenAI.')
234
+ model = load_openai_model(
235
+ model_name,
236
+ precision=precision,
237
+ device=device,
238
+ jit=jit,
239
+ cache_dir=cache_dir,
240
+ )
241
+ else:
242
+ model_cfg = get_model_config(model_name)
243
+ if model_cfg is not None:
244
+ logging.info(f'Loaded {model_name} model config.')
245
+ else:
246
+ logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
247
+ raise RuntimeError(f'Model config for {model_name} not found.')
248
+
249
+ if 'rope' in model_cfg.get('vision_cfg', {}):
250
+ if model_cfg['vision_cfg']['rope']:
251
+ os.environ['RoPE'] = "1"
252
+ else:
253
+ os.environ['RoPE'] = "0"
254
+
255
+ if force_quick_gelu:
256
+ # override for use of QuickGELU on non-OpenAI transformer models
257
+ model_cfg["quick_gelu"] = True
258
+
259
+ if force_patch_dropout is not None:
260
+ # override the default patch dropout value
261
+ model_cfg['vision_cfg']["patch_dropout"] = force_patch_dropout
262
+
263
+ cast_dtype = get_cast_dtype(precision)
264
+ custom_clip = model_cfg.pop('custom_text', False) or force_custom_clip or ('hf_model_name' in model_cfg['text_cfg'])
265
+
266
+
267
+ if custom_clip:
268
+ if 'hf_model_name' in model_cfg.get('text_cfg', {}):
269
+ model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
270
+ model = CustomCLIP(**model_cfg, cast_dtype=cast_dtype)
271
+ else:
272
+ model = CLIP(**model_cfg, cast_dtype=cast_dtype)
273
+
274
+ pretrained_cfg = {}
275
+ if pretrained:
276
+ checkpoint_path = ''
277
+ pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
278
+ if pretrained_cfg:
279
+ checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
280
+ elif os.path.exists(pretrained):
281
+ checkpoint_path = pretrained
282
+
283
+ if checkpoint_path:
284
+ logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
285
+ load_checkpoint(model,
286
+ checkpoint_path,
287
+ model_key="model|module|state_dict",
288
+ strict=False
289
+ )
290
+ else:
291
+ error_str = (
292
+ f'Pretrained weights ({pretrained}) not found for model {model_name}. '
293
+ f'Available pretrained tags ({list_pretrained_tags_by_model(model_name)}).')
294
+ logging.warning(error_str)
295
+ raise RuntimeError(error_str)
296
+ else:
297
+ visual_checkpoint_path = ''
298
+ text_checkpoint_path = ''
299
+
300
+ if pretrained_image:
301
+ pretrained_visual_model = pretrained_visual_model.replace('/', '-') # for callers using old naming with / in ViT names
302
+ pretrained_image_cfg = get_pretrained_cfg(pretrained_visual_model, pretrained_image)
303
+ if 'timm_model_name' in model_cfg.get('vision_cfg', {}):
304
+ # pretrained weight loading for timm models set via vision_cfg
305
+ model_cfg['vision_cfg']['timm_model_pretrained'] = True
306
+ elif pretrained_image_cfg:
307
+ visual_checkpoint_path = download_pretrained(pretrained_image_cfg, cache_dir=cache_dir)
308
+ elif os.path.exists(pretrained_image):
309
+ visual_checkpoint_path = pretrained_image
310
+ else:
311
+ logging.warning(f'Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.')
312
+ raise RuntimeError(f'Pretrained weights ({visual_checkpoint_path}) not found for model {model_name}.visual.')
313
+
314
+ if pretrained_text:
315
+ pretrained_text_model = pretrained_text_model.replace('/', '-') # for callers using old naming with / in ViT names
316
+ pretrained_text_cfg = get_pretrained_cfg(pretrained_text_model, pretrained_text)
317
+ if pretrained_text_cfg:
318
+ text_checkpoint_path = download_pretrained(pretrained_text_cfg, cache_dir=cache_dir)
319
+ elif os.path.exists(pretrained_text):
320
+ text_checkpoint_path = pretrained_text
321
+ else:
322
+ logging.warning(f'Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.')
323
+ raise RuntimeError(f'Pretrained weights ({text_checkpoint_path}) not found for model {model_name}.text.')
324
+
325
+ if visual_checkpoint_path:
326
+ logging.info(f'Loading pretrained {model_name}.visual weights ({visual_checkpoint_path}).')
327
+ if text_checkpoint_path:
328
+ logging.info(f'Loading pretrained {model_name}.text weights ({text_checkpoint_path}).')
329
+
330
+ if visual_checkpoint_path or text_checkpoint_path:
331
+ load_pretrained_checkpoint(
332
+ model,
333
+ visual_checkpoint_path,
334
+ text_checkpoint_path,
335
+ strict=False,
336
+ visual_model=pretrained_visual_model,
337
+ text_model=pretrained_text_model,
338
+ model_key="model|module|state_dict",
339
+ skip_list=skip_list
340
+ )
341
+
342
+ if "fp16" in precision or "bf16" in precision:
343
+ logging.info(f'convert precision to {precision}')
344
+ model = model.to(torch.bfloat16) if 'bf16' in precision else model.to(torch.float16)
345
+
346
+ model.to(device=device)
347
+
348
+ # set image mean / std metadata from pretrained_cfg if available, or use defaults
349
+ model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
350
+ model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD
351
+
352
+ if jit:
353
+ model = torch.jit.script(model)
354
+
355
+ return model
356
+
357
+
358
+ def create_model_and_transforms(
359
+ model_name: str,
360
+ pretrained: Optional[str] = None,
361
+ precision: str = 'fp32',
362
+ device: Union[str, torch.device] = 'cpu',
363
+ jit: bool = False,
364
+ force_quick_gelu: bool = False,
365
+ force_custom_clip: bool = False,
366
+ force_patch_dropout: Optional[float] = None,
367
+ pretrained_image: str = '',
368
+ pretrained_text: str = '',
369
+ pretrained_hf: bool = True,
370
+ pretrained_visual_model: str = None,
371
+ pretrained_text_model: str = None,
372
+ image_mean: Optional[Tuple[float, ...]] = None,
373
+ image_std: Optional[Tuple[float, ...]] = None,
374
+ cache_dir: Optional[str] = None,
375
+ skip_list: list = [],
376
+ ):
377
+ model = create_model(
378
+ model_name,
379
+ pretrained,
380
+ precision=precision,
381
+ device=device,
382
+ jit=jit,
383
+ force_quick_gelu=force_quick_gelu,
384
+ force_custom_clip=force_custom_clip,
385
+ force_patch_dropout=force_patch_dropout,
386
+ pretrained_image=pretrained_image,
387
+ pretrained_text=pretrained_text,
388
+ pretrained_hf=pretrained_hf,
389
+ pretrained_visual_model=pretrained_visual_model,
390
+ pretrained_text_model=pretrained_text_model,
391
+ cache_dir=cache_dir,
392
+ skip_list=skip_list,
393
+ )
394
+
395
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
396
+ image_std = image_std or getattr(model.visual, 'image_std', None)
397
+ preprocess_train = image_transform(
398
+ model.visual.image_size,
399
+ is_train=True,
400
+ mean=image_mean,
401
+ std=image_std
402
+ )
403
+ preprocess_val = image_transform(
404
+ model.visual.image_size,
405
+ is_train=False,
406
+ mean=image_mean,
407
+ std=image_std
408
+ )
409
+
410
+ return model, preprocess_train, preprocess_val
411
+
412
+
413
+ def create_transforms(
414
+ model_name: str,
415
+ pretrained: Optional[str] = None,
416
+ precision: str = 'fp32',
417
+ device: Union[str, torch.device] = 'cpu',
418
+ jit: bool = False,
419
+ force_quick_gelu: bool = False,
420
+ force_custom_clip: bool = False,
421
+ force_patch_dropout: Optional[float] = None,
422
+ pretrained_image: str = '',
423
+ pretrained_text: str = '',
424
+ pretrained_hf: bool = True,
425
+ pretrained_visual_model: str = None,
426
+ pretrained_text_model: str = None,
427
+ image_mean: Optional[Tuple[float, ...]] = None,
428
+ image_std: Optional[Tuple[float, ...]] = None,
429
+ cache_dir: Optional[str] = None,
430
+ skip_list: list = [],
431
+ ):
432
+ model = create_model(
433
+ model_name,
434
+ pretrained,
435
+ precision=precision,
436
+ device=device,
437
+ jit=jit,
438
+ force_quick_gelu=force_quick_gelu,
439
+ force_custom_clip=force_custom_clip,
440
+ force_patch_dropout=force_patch_dropout,
441
+ pretrained_image=pretrained_image,
442
+ pretrained_text=pretrained_text,
443
+ pretrained_hf=pretrained_hf,
444
+ pretrained_visual_model=pretrained_visual_model,
445
+ pretrained_text_model=pretrained_text_model,
446
+ cache_dir=cache_dir,
447
+ skip_list=skip_list,
448
+ )
449
+
450
+
451
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
452
+ image_std = image_std or getattr(model.visual, 'image_std', None)
453
+ preprocess_train = image_transform(
454
+ model.visual.image_size,
455
+ is_train=True,
456
+ mean=image_mean,
457
+ std=image_std
458
+ )
459
+ preprocess_val = image_transform(
460
+ model.visual.image_size,
461
+ is_train=False,
462
+ mean=image_mean,
463
+ std=image_std
464
+ )
465
+ del model
466
+
467
+ return preprocess_train, preprocess_val
468
+
469
+ def create_model_from_pretrained(
470
+ model_name: str,
471
+ pretrained: str,
472
+ precision: str = 'fp32',
473
+ device: Union[str, torch.device] = 'cpu',
474
+ jit: bool = False,
475
+ force_quick_gelu: bool = False,
476
+ force_custom_clip: bool = False,
477
+ force_patch_dropout: Optional[float] = None,
478
+ return_transform: bool = True,
479
+ image_mean: Optional[Tuple[float, ...]] = None,
480
+ image_std: Optional[Tuple[float, ...]] = None,
481
+ cache_dir: Optional[str] = None,
482
+ is_frozen: bool = False,
483
+ ):
484
+ if not is_pretrained_cfg(model_name, pretrained) and not os.path.exists(pretrained):
485
+ raise RuntimeError(
486
+ f'{pretrained} is not a valid pretrained cfg or checkpoint for {model_name}.'
487
+ f' Use open_clip.list_pretrained() to find one.')
488
+
489
+ model = create_model(
490
+ model_name,
491
+ pretrained,
492
+ precision=precision,
493
+ device=device,
494
+ jit=jit,
495
+ force_quick_gelu=force_quick_gelu,
496
+ force_custom_clip=force_custom_clip,
497
+ force_patch_dropout=force_patch_dropout,
498
+ cache_dir=cache_dir,
499
+ )
500
+
501
+ if is_frozen:
502
+ for param in model.parameters():
503
+ param.requires_grad = False
504
+
505
+ if not return_transform:
506
+ return model
507
+
508
+ image_mean = image_mean or getattr(model.visual, 'image_mean', None)
509
+ image_std = image_std or getattr(model.visual, 'image_std', None)
510
+ preprocess = image_transform(
511
+ model.visual.image_size,
512
+ is_train=False,
513
+ mean=image_mean,
514
+ std=image_std
515
+ )
516
+
517
+ return model, preprocess
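
The factory functions above resolve `model_name` against the JSON files under `model_configs/` and optionally download or load weights before building the vision and text towers. A minimal usage sketch (illustrative only, not part of the uploaded files; the checkpoint path is a hypothetical placeholder and the import assumes this `eva_clip` directory is on `sys.path`):

    import torch
    from eva_clip.factory import create_model_and_transforms  # assumes eva_clip/ is importable

    model, preprocess_train, preprocess_val = create_model_and_transforms(
        "EVA02-CLIP-L-14-336",                      # must match a file in model_configs/
        pretrained="/path/to/eva02_clip_l_336.pt",  # hypothetical local checkpoint
        precision="fp16",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    model.eval()
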
PuLID_ComfyUI/eva_clip/hf_configs.py ADDED
@@ -0,0 +1,57 @@
1
+ # HF architecture dict:
2
+ arch_dict = {
3
+ # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
4
+ "roberta": {
5
+ "config_names": {
6
+ "context_length": "max_position_embeddings",
7
+ "vocab_size": "vocab_size",
8
+ "width": "hidden_size",
9
+ "heads": "num_attention_heads",
10
+ "layers": "num_hidden_layers",
11
+ "layer_attr": "layer",
12
+ "token_embeddings_attr": "embeddings"
13
+ },
14
+ "pooler": "mean_pooler",
15
+ },
16
+ # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
17
+ "xlm-roberta": {
18
+ "config_names": {
19
+ "context_length": "max_position_embeddings",
20
+ "vocab_size": "vocab_size",
21
+ "width": "hidden_size",
22
+ "heads": "num_attention_heads",
23
+ "layers": "num_hidden_layers",
24
+ "layer_attr": "layer",
25
+ "token_embeddings_attr": "embeddings"
26
+ },
27
+ "pooler": "mean_pooler",
28
+ },
29
+ # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
30
+ "mt5": {
31
+ "config_names": {
32
+ # unlimited seqlen
33
+ # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
34
+ # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
35
+ "context_length": "",
36
+ "vocab_size": "vocab_size",
37
+ "width": "d_model",
38
+ "heads": "num_heads",
39
+ "layers": "num_layers",
40
+ "layer_attr": "block",
41
+ "token_embeddings_attr": "embed_tokens"
42
+ },
43
+ "pooler": "mean_pooler",
44
+ },
45
+ "bert": {
46
+ "config_names": {
47
+ "context_length": "max_position_embeddings",
48
+ "vocab_size": "vocab_size",
49
+ "width": "hidden_size",
50
+ "heads": "num_attention_heads",
51
+ "layers": "num_hidden_layers",
52
+ "layer_attr": "layer",
53
+ "token_embeddings_attr": "embeddings"
54
+ },
55
+ "pooler": "mean_pooler",
56
+ }
57
+ }
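
`arch_dict` maps a HuggingFace `model_type` string to the config attribute names and pooler that `hf_model.py` (next file) reads from the model config. A small sketch of that lookup (illustrative; `xlm-roberta-base` is just an example checkpoint name and the import assumes `eva_clip` is on `sys.path`):

    from transformers import AutoConfig
    from eva_clip.hf_configs import arch_dict  # assumes eva_clip/ is importable

    cfg = AutoConfig.from_pretrained("xlm-roberta-base")  # example model; downloads its config
    names = arch_dict[cfg.model_type]["config_names"]     # cfg.model_type == "xlm-roberta"
    width = getattr(cfg, names["width"])                  # hidden_size
    layers = getattr(cfg, names["layers"])                # num_hidden_layers
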
PuLID_ComfyUI/eva_clip/hf_model.py ADDED
@@ -0,0 +1,248 @@
1
+ """ huggingface model adapter
2
+
3
+ Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as the text tower in a CLIP model.
4
+ """
5
+
6
+ import re
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.nn import functional as F
11
+ from torch import TensorType
12
+ try:
13
+ import transformers
14
+ from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, AutoConfig, PretrainedConfig
15
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, \
16
+ BaseModelOutputWithPoolingAndCrossAttentions
17
+ except ImportError as e:
18
+ transformers = None
19
+
20
+
21
+ class BaseModelOutput:
22
+ pass
23
+
24
+
25
+ class PretrainedConfig:
26
+ pass
27
+
28
+ from .hf_configs import arch_dict
29
+
30
+ # utils
31
+ def _camel2snake(s):
32
+ return re.sub(r'(?<!^)(?=[A-Z])', '_', s).lower()
33
+
34
+ # TODO: ?last - for gpt-like models
35
+ _POOLERS = {}
36
+
37
+ def register_pooler(cls):
38
+ """Decorator registering pooler class"""
39
+ _POOLERS[_camel2snake(cls.__name__)] = cls
40
+ return cls
41
+
42
+
43
+ @register_pooler
44
+ class MeanPooler(nn.Module):
45
+ """Mean pooling"""
46
+ def forward(self, x:BaseModelOutput, attention_mask:TensorType):
47
+ masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
48
+ return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)
49
+
50
+ @register_pooler
51
+ class MaxPooler(nn.Module):
52
+ """Max pooling"""
53
+ def forward(self, x:BaseModelOutput, attention_mask:TensorType):
54
+ masked_output = x.last_hidden_state.masked_fill(attention_mask.unsqueeze(-1), -torch.inf)
55
+ return masked_output.max(1).values
56
+
57
+ @register_pooler
58
+ class ClsPooler(nn.Module):
59
+ """CLS token pooling"""
60
+ def __init__(self, use_pooler_output=True):
61
+ super().__init__()
62
+ self.cls_token_position = 0
63
+ self.use_pooler_output = use_pooler_output
64
+
65
+ def forward(self, x:BaseModelOutput, attention_mask:TensorType):
66
+
67
+ if (self.use_pooler_output and
68
+ isinstance(x, (BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions)) and
69
+ (x.pooler_output is not None)
70
+ ):
71
+ return x.pooler_output
72
+
73
+ return x.last_hidden_state[:, self.cls_token_position, :]
74
+
75
+ class HFTextEncoder(nn.Module):
76
+ """HuggingFace model adapter"""
77
+ def __init__(
78
+ self,
79
+ model_name_or_path: str,
80
+ output_dim: int,
81
+ tokenizer_name: str = None,
82
+ config: PretrainedConfig = None,
83
+ pooler_type: str = None,
84
+ proj: str = None,
85
+ pretrained: bool = True,
86
+ masked_language_modeling: bool = False):
87
+ super().__init__()
88
+
89
+ self.output_dim = output_dim
90
+
91
+ # TODO: find better way to get this information
92
+ uses_transformer_pooler = (pooler_type == "cls_pooler")
93
+
94
+ if transformers is None:
95
+ raise RuntimeError("Please `pip install transformers` to use pre-trained HuggingFace models")
96
+ if config is None:
97
+ self.config = AutoConfig.from_pretrained(model_name_or_path)
98
+ if masked_language_modeling:
99
+ create_func, model_args = (AutoModelForMaskedLM.from_pretrained, model_name_or_path) if pretrained else (
100
+ AutoModelForMaskedLM.from_config, self.config)
101
+ else:
102
+ create_func, model_args = (AutoModel.from_pretrained, model_name_or_path) if pretrained else (
103
+ AutoModel.from_config, self.config)
104
+ # TODO: do all model configs have this attribute? PretrainedConfig does so yes??
105
+ if hasattr(self.config, "is_encoder_decoder") and self.config.is_encoder_decoder:
106
+ self.transformer = create_func(model_args)
107
+ self.transformer = self.transformer.encoder
108
+ else:
109
+ self.transformer = create_func(model_args, add_pooling_layer=uses_transformer_pooler)
110
+ else:
111
+ self.config = config
112
+ if masked_language_modeling:
113
+ self.transformer = AutoModelForMaskedLM.from_config(config)
114
+ else:
115
+ self.transformer = AutoModel.from_config(config)
116
+
117
+ if pooler_type is None: # get default arch pooler
118
+ self.pooler = _POOLERS[(arch_dict[self.config.model_type]["pooler"])]()
119
+ else:
120
+ self.pooler = _POOLERS[pooler_type]()
121
+
122
+ d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"])
123
+ if (d_model == output_dim) and (proj is None): # do we always need a proj?
124
+ self.proj = nn.Identity()
125
+ elif proj == 'linear':
126
+ self.proj = nn.Linear(d_model, output_dim, bias=False)
127
+ elif proj == 'mlp':
128
+ hidden_size = (d_model + output_dim) // 2
129
+ self.proj = nn.Sequential(
130
+ nn.Linear(d_model, hidden_size, bias=False),
131
+ nn.GELU(),
132
+ nn.Linear(hidden_size, output_dim, bias=False),
133
+ )
134
+
135
+ # self.itm_proj = nn.Linear(d_model, 2, bias=False)
136
+ # self.mlm_proj = nn.Linear(d_model, self.config.vocab_size), bias=False)
137
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
138
+
139
+ # def forward_itm(self, x:TensorType, image_embeds:TensorType) -> TensorType:
140
+ # image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(x.device)
141
+ # attn_mask = (x != self.config.pad_token_id).long()
142
+ # out = self.transformer(
143
+ # input_ids=x,
144
+ # attention_mask=attn_mask,
145
+ # encoder_hidden_states = image_embeds,
146
+ # encoder_attention_mask = image_atts,
147
+ # )
148
+ # pooled_out = self.pooler(out, attn_mask)
149
+
150
+ # return self.itm_proj(pooled_out)
151
+
152
+ def mask(self, input_ids, vocab_size, device, targets=None, masked_indices=None, probability_matrix=None):
153
+ if masked_indices is None:
154
+ masked_indices = torch.bernoulli(probability_matrix).bool()
155
+
156
+ masked_indices[input_ids == self.tokenizer.pad_token_id] = False
157
+ masked_indices[input_ids == self.tokenizer.cls_token_id] = False
158
+
159
+ if targets is not None:
160
+ targets[~masked_indices] = -100 # We only compute loss on masked tokens
161
+
162
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
163
+ indices_replaced = torch.bernoulli(torch.full(input_ids.shape, 0.8)).bool() & masked_indices
164
+ input_ids[indices_replaced] = self.tokenizer.mask_token_id
165
+
166
+ # 10% of the time, we replace masked input tokens with random word
167
+ indices_random = torch.bernoulli(torch.full(input_ids.shape, 0.5)).bool() & masked_indices & ~indices_replaced
168
+ random_words = torch.randint(vocab_size, input_ids.shape, dtype=torch.long).to(device)
169
+ input_ids[indices_random] = random_words[indices_random]
170
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
171
+
172
+ if targets is not None:
173
+ return input_ids, targets
174
+ else:
175
+ return input_ids
176
+
177
+ def forward_mlm(self, input_ids, image_embeds, mlm_probability=0.25):
178
+ labels = input_ids.clone()
179
+ attn_mask = (input_ids != self.config.pad_token_id).long()
180
+ image_atts = torch.ones(image_embeds.size()[:-1],dtype=torch.long).to(input_ids.device)
181
+ vocab_size = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["vocab_size"])
182
+ probability_matrix = torch.full(labels.shape, mlm_probability)
183
+ input_ids, labels = self.mask(input_ids, vocab_size, input_ids.device, targets=labels,
184
+ probability_matrix = probability_matrix)
185
+ mlm_output = self.transformer(input_ids,
186
+ attention_mask = attn_mask,
187
+ encoder_hidden_states = image_embeds,
188
+ encoder_attention_mask = image_atts,
189
+ return_dict = True,
190
+ labels = labels,
191
+ )
192
+ return mlm_output.loss
193
+ # mlm_output = self.transformer(input_ids,
194
+ # attention_mask = attn_mask,
195
+ # encoder_hidden_states = image_embeds,
196
+ # encoder_attention_mask = image_atts,
197
+ # return_dict = True,
198
+ # ).last_hidden_state
199
+ # logits = self.mlm_proj(mlm_output)
200
+
201
+ # # logits = logits[:, :-1, :].contiguous().view(-1, vocab_size)
202
+ # logits = logits[:, 1:, :].contiguous().view(-1, vocab_size)
203
+ # labels = labels[:, 1:].contiguous().view(-1)
204
+
205
+ # mlm_loss = F.cross_entropy(
206
+ # logits,
207
+ # labels,
208
+ # # label_smoothing=0.1,
209
+ # )
210
+ # return mlm_loss
211
+
212
+
213
+ def forward(self, x:TensorType) -> TensorType:
214
+ attn_mask = (x != self.config.pad_token_id).long()
215
+ out = self.transformer(input_ids=x, attention_mask=attn_mask)
216
+ pooled_out = self.pooler(out, attn_mask)
217
+
218
+ return self.proj(pooled_out)
219
+
220
+ def lock(self, unlocked_layers:int=0, freeze_layer_norm:bool=True):
221
+ if not unlocked_layers: # full freezing
222
+ for n, p in self.transformer.named_parameters():
223
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
224
+ return
225
+
226
+ encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
227
+ layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
228
+ print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
229
+ embeddings = getattr(
230
+ self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"])
231
+ modules = [embeddings, *layer_list][:-unlocked_layers]
232
+ # freeze layers
233
+ for module in modules:
234
+ for n, p in module.named_parameters():
235
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
236
+
237
+
238
+ @torch.jit.ignore
239
+ def set_grad_checkpointing(self, enable=True):
240
+ self.transformer.gradient_checkpointing_enable()
241
+
242
+ def get_num_layers(self):
243
+ encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
244
+ layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
245
+ return len(layer_list)
246
+
247
+ def init_parameters(self):
248
+ pass
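
The `register_pooler` decorator stores each pooler class under its snake_case name, which is how the `"pooler"` strings in `hf_configs.py` and the `pooler_type` argument of `HFTextEncoder` are resolved. A quick check of that registry (illustrative; the import assumes `eva_clip` is on `sys.path` and `torch`/`transformers` are installed):

    from eva_clip.hf_model import _POOLERS, _camel2snake

    print(sorted(_POOLERS))            # ['cls_pooler', 'max_pooler', 'mean_pooler']
    print(_camel2snake("MeanPooler"))  # 'mean_pooler'
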
PuLID_ComfyUI/eva_clip/loss.py ADDED
@@ -0,0 +1,138 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import functional as F
5
+
6
+ try:
7
+ import torch.distributed.nn
8
+ from torch import distributed as dist
9
+ has_distributed = True
10
+ except ImportError:
11
+ has_distributed = False
12
+
13
+ try:
14
+ import horovod.torch as hvd
15
+ except ImportError:
16
+ hvd = None
17
+
18
+ from timm.loss import LabelSmoothingCrossEntropy
19
+
20
+
21
+ def gather_features(
22
+ image_features,
23
+ text_features,
24
+ local_loss=False,
25
+ gather_with_grad=False,
26
+ rank=0,
27
+ world_size=1,
28
+ use_horovod=False
29
+ ):
30
+ assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
31
+ if use_horovod:
32
+ assert hvd is not None, 'Please install horovod'
33
+ if gather_with_grad:
34
+ all_image_features = hvd.allgather(image_features)
35
+ all_text_features = hvd.allgather(text_features)
36
+ else:
37
+ with torch.no_grad():
38
+ all_image_features = hvd.allgather(image_features)
39
+ all_text_features = hvd.allgather(text_features)
40
+ if not local_loss:
41
+ # ensure grads for local rank when all_* features don't have a gradient
42
+ gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
43
+ gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
44
+ gathered_image_features[rank] = image_features
45
+ gathered_text_features[rank] = text_features
46
+ all_image_features = torch.cat(gathered_image_features, dim=0)
47
+ all_text_features = torch.cat(gathered_text_features, dim=0)
48
+ else:
49
+ # We gather tensors from all gpus
50
+ if gather_with_grad:
51
+ all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
52
+ all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
53
+ # all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features, async_op=True), dim=0)
54
+ # all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features, async_op=True), dim=0)
55
+ else:
56
+ gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
57
+ gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
58
+ dist.all_gather(gathered_image_features, image_features)
59
+ dist.all_gather(gathered_text_features, text_features)
60
+ if not local_loss:
61
+ # ensure grads for local rank when all_* features don't have a gradient
62
+ gathered_image_features[rank] = image_features
63
+ gathered_text_features[rank] = text_features
64
+ all_image_features = torch.cat(gathered_image_features, dim=0)
65
+ all_text_features = torch.cat(gathered_text_features, dim=0)
66
+
67
+ return all_image_features, all_text_features
68
+
69
+
70
+ class ClipLoss(nn.Module):
71
+
72
+ def __init__(
73
+ self,
74
+ local_loss=False,
75
+ gather_with_grad=False,
76
+ cache_labels=False,
77
+ rank=0,
78
+ world_size=1,
79
+ use_horovod=False,
80
+ smoothing=0.,
81
+ ):
82
+ super().__init__()
83
+ self.local_loss = local_loss
84
+ self.gather_with_grad = gather_with_grad
85
+ self.cache_labels = cache_labels
86
+ self.rank = rank
87
+ self.world_size = world_size
88
+ self.use_horovod = use_horovod
89
+ self.label_smoothing_cross_entropy = LabelSmoothingCrossEntropy(smoothing=smoothing) if smoothing > 0 else None
90
+
91
+ # cache state
92
+ self.prev_num_logits = 0
93
+ self.labels = {}
94
+
95
+ def forward(self, image_features, text_features, logit_scale=1.):
96
+ device = image_features.device
97
+ if self.world_size > 1:
98
+ all_image_features, all_text_features = gather_features(
99
+ image_features, text_features,
100
+ self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)
101
+
102
+ if self.local_loss:
103
+ logits_per_image = logit_scale * image_features @ all_text_features.T
104
+ logits_per_text = logit_scale * text_features @ all_image_features.T
105
+ else:
106
+ logits_per_image = logit_scale * all_image_features @ all_text_features.T
107
+ logits_per_text = logits_per_image.T
108
+ else:
109
+ logits_per_image = logit_scale * image_features @ text_features.T
110
+ logits_per_text = logit_scale * text_features @ image_features.T
111
+ # calculated ground-truth and cache if enabled
112
+ num_logits = logits_per_image.shape[0]
113
+ if self.prev_num_logits != num_logits or device not in self.labels:
114
+ labels = torch.arange(num_logits, device=device, dtype=torch.long)
115
+ if self.world_size > 1 and self.local_loss:
116
+ labels = labels + num_logits * self.rank
117
+ if self.cache_labels:
118
+ self.labels[device] = labels
119
+ self.prev_num_logits = num_logits
120
+ else:
121
+ labels = self.labels[device]
122
+
123
+ if self.label_smoothing_cross_entropy:
124
+ total_loss = (
125
+ self.label_smoothing_cross_entropy(logits_per_image, labels) +
126
+ self.label_smoothing_cross_entropy(logits_per_text, labels)
127
+ ) / 2
128
+ else:
129
+ total_loss = (
130
+ F.cross_entropy(logits_per_image, labels) +
131
+ F.cross_entropy(logits_per_text, labels)
132
+ ) / 2
133
+
134
+ acc = None
135
+ i2t_acc = (logits_per_image.argmax(-1) == labels).sum() / len(logits_per_image)
136
+ t2i_acc = (logits_per_text.argmax(-1) == labels).sum() / len(logits_per_text)
137
+ acc = {"i2t": i2t_acc, "t2i": t2i_acc}
138
+ return total_loss, acc
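
In the single-process case (`world_size == 1`) `ClipLoss` reduces to a symmetric cross-entropy over the in-batch image-text similarity matrix; it expects already-normalized feature batches of equal size plus an already-exponentiated logit scale. A single-GPU sketch (illustrative shapes; importing `eva_clip.loss` also requires `timm`):

    import torch
    import torch.nn.functional as F
    from eva_clip.loss import ClipLoss  # assumes eva_clip/ is importable

    loss_fn = ClipLoss()  # defaults: local_loss=False, world_size=1, no label smoothing
    img = F.normalize(torch.randn(8, 512), dim=-1)
    txt = F.normalize(torch.randn(8, 512), dim=-1)
    total_loss, acc = loss_fn(img, txt, logit_scale=100.0)
    print(total_loss.item(), acc["i2t"].item(), acc["t2i"].item())
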
PuLID_ComfyUI/eva_clip/model.py ADDED
@@ -0,0 +1,439 @@
1
+ """ CLIP Model
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ import os
6
+ from dataclasses import dataclass
7
+ from typing import Optional, Tuple, Union
8
+ from functools import partial
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch import nn
14
+
15
+ try:
16
+ from .hf_model import HFTextEncoder
17
+ except:
18
+ HFTextEncoder = None
19
+ from .modified_resnet import ModifiedResNet
20
+ from .timm_model import TimmModel
21
+ from .eva_vit_model import EVAVisionTransformer
22
+ from .transformer import LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer
23
+
24
+ try:
25
+ from apex.normalization import FusedLayerNorm
26
+ except:
27
+ FusedLayerNorm = LayerNorm
28
+ print("Nvidia APEX normalization not installed, using PyTorch LayerNorm")
29
+
30
+ try:
31
+ import xformers.ops as xops
32
+ except ImportError:
33
+ xops = None
34
+ #print("Please 'pip install xformers'")
35
+
36
+ @dataclass
37
+ class CLIPVisionCfg:
38
+ layers: Union[Tuple[int, int, int, int], int] = 12
39
+ width: int = 768
40
+ head_width: int = 64
41
+ mlp_ratio: float = 4.0
42
+ patch_size: int = 16
43
+ image_size: Union[Tuple[int, int], int] = 224
44
+ ls_init_value: Optional[float] = None # layer scale initial value
45
+ patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results
46
+ global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580)
47
+ drop_path_rate: Optional[float] = None # drop path rate
48
+ timm_model_name: str = None # a valid model name overrides layers, width, patch_size
49
+ timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
50
+ timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
51
+ timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
52
+ timm_proj_bias: bool = False # enable bias final projection
53
+ eva_model_name: str = None # a valid eva model name overrides layers, width, patch_size
54
+ qkv_bias: bool = True
55
+ fusedLN: bool = False
56
+ xattn: bool = False
57
+ postnorm: bool = False
58
+ rope: bool = False
59
+ pt_hw_seq_len: int = 16 # 224/14
60
+ intp_freq: bool = False
61
+ naiveswiglu: bool = False
62
+ subln: bool = False
63
+
64
+
65
+ @dataclass
66
+ class CLIPTextCfg:
67
+ context_length: int = 77
68
+ vocab_size: int = 49408
69
+ width: int = 512
70
+ heads: int = 8
71
+ layers: int = 12
72
+ ls_init_value: Optional[float] = None # layer scale initial value
73
+ hf_model_name: str = None
74
+ hf_tokenizer_name: str = None
75
+ hf_model_pretrained: bool = True
76
+ proj: str = 'mlp'
77
+ pooler_type: str = 'mean_pooler'
78
+ masked_language_modeling: bool = False
79
+ fusedLN: bool = False
80
+ xattn: bool = False
81
+ attn_mask: bool = True
82
+
83
+ def get_cast_dtype(precision: str):
84
+ cast_dtype = None
85
+ if precision == 'bf16':
86
+ cast_dtype = torch.bfloat16
87
+ elif precision == 'fp16':
88
+ cast_dtype = torch.float16
89
+ return cast_dtype
90
+
91
+
92
+ def _build_vision_tower(
93
+ embed_dim: int,
94
+ vision_cfg: CLIPVisionCfg,
95
+ quick_gelu: bool = False,
96
+ cast_dtype: Optional[torch.dtype] = None
97
+ ):
98
+ if isinstance(vision_cfg, dict):
99
+ vision_cfg = CLIPVisionCfg(**vision_cfg)
100
+
101
+ # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
102
+ # memory efficient in recent PyTorch releases (>= 1.10).
103
+ # NOTE: timm models always use native GELU regardless of quick_gelu flag.
104
+ act_layer = QuickGELU if quick_gelu else nn.GELU
105
+
106
+ if vision_cfg.eva_model_name:
107
+ vision_heads = vision_cfg.width // vision_cfg.head_width
108
+ norm_layer = LayerNorm
109
+
110
+ visual = EVAVisionTransformer(
111
+ img_size=vision_cfg.image_size,
112
+ patch_size=vision_cfg.patch_size,
113
+ num_classes=embed_dim,
114
+ use_mean_pooling=vision_cfg.global_average_pool, #False
115
+ init_values=vision_cfg.ls_init_value,
116
+ patch_dropout=vision_cfg.patch_dropout,
117
+ embed_dim=vision_cfg.width,
118
+ depth=vision_cfg.layers,
119
+ num_heads=vision_heads,
120
+ mlp_ratio=vision_cfg.mlp_ratio,
121
+ qkv_bias=vision_cfg.qkv_bias,
122
+ drop_path_rate=vision_cfg.drop_path_rate,
123
+ norm_layer= partial(FusedLayerNorm, eps=1e-6) if vision_cfg.fusedLN else partial(norm_layer, eps=1e-6),
124
+ xattn=vision_cfg.xattn,
125
+ rope=vision_cfg.rope,
126
+ postnorm=vision_cfg.postnorm,
127
+ pt_hw_seq_len= vision_cfg.pt_hw_seq_len, # 224/14
128
+ intp_freq= vision_cfg.intp_freq,
129
+ naiveswiglu= vision_cfg.naiveswiglu,
130
+ subln= vision_cfg.subln
131
+ )
132
+ elif vision_cfg.timm_model_name:
133
+ visual = TimmModel(
134
+ vision_cfg.timm_model_name,
135
+ pretrained=vision_cfg.timm_model_pretrained,
136
+ pool=vision_cfg.timm_pool,
137
+ proj=vision_cfg.timm_proj,
138
+ proj_bias=vision_cfg.timm_proj_bias,
139
+ embed_dim=embed_dim,
140
+ image_size=vision_cfg.image_size
141
+ )
142
+ act_layer = nn.GELU # so that text transformer doesn't use QuickGELU w/ timm models
143
+ elif isinstance(vision_cfg.layers, (tuple, list)):
144
+ vision_heads = vision_cfg.width * 32 // vision_cfg.head_width
145
+ visual = ModifiedResNet(
146
+ layers=vision_cfg.layers,
147
+ output_dim=embed_dim,
148
+ heads=vision_heads,
149
+ image_size=vision_cfg.image_size,
150
+ width=vision_cfg.width
151
+ )
152
+ else:
153
+ vision_heads = vision_cfg.width // vision_cfg.head_width
154
+ norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
155
+ visual = VisionTransformer(
156
+ image_size=vision_cfg.image_size,
157
+ patch_size=vision_cfg.patch_size,
158
+ width=vision_cfg.width,
159
+ layers=vision_cfg.layers,
160
+ heads=vision_heads,
161
+ mlp_ratio=vision_cfg.mlp_ratio,
162
+ ls_init_value=vision_cfg.ls_init_value,
163
+ patch_dropout=vision_cfg.patch_dropout,
164
+ global_average_pool=vision_cfg.global_average_pool,
165
+ output_dim=embed_dim,
166
+ act_layer=act_layer,
167
+ norm_layer=norm_layer,
168
+ )
169
+
170
+ return visual
171
+
172
+
173
+ def _build_text_tower(
174
+ embed_dim: int,
175
+ text_cfg: CLIPTextCfg,
176
+ quick_gelu: bool = False,
177
+ cast_dtype: Optional[torch.dtype] = None,
178
+ ):
179
+ if isinstance(text_cfg, dict):
180
+ text_cfg = CLIPTextCfg(**text_cfg)
181
+
182
+ if text_cfg.hf_model_name:
183
+ text = HFTextEncoder(
184
+ text_cfg.hf_model_name,
185
+ output_dim=embed_dim,
186
+ tokenizer_name=text_cfg.hf_tokenizer_name,
187
+ proj=text_cfg.proj,
188
+ pooler_type=text_cfg.pooler_type,
189
+ masked_language_modeling=text_cfg.masked_language_modeling
190
+ )
191
+ else:
192
+ act_layer = QuickGELU if quick_gelu else nn.GELU
193
+ norm_layer = LayerNorm
194
+
195
+ text = TextTransformer(
196
+ context_length=text_cfg.context_length,
197
+ vocab_size=text_cfg.vocab_size,
198
+ width=text_cfg.width,
199
+ heads=text_cfg.heads,
200
+ layers=text_cfg.layers,
201
+ ls_init_value=text_cfg.ls_init_value,
202
+ output_dim=embed_dim,
203
+ act_layer=act_layer,
204
+ norm_layer= FusedLayerNorm if text_cfg.fusedLN else norm_layer,
205
+ xattn=text_cfg.xattn,
206
+ attn_mask=text_cfg.attn_mask,
207
+ )
208
+ return text
209
+
210
+ class CLIP(nn.Module):
211
+ def __init__(
212
+ self,
213
+ embed_dim: int,
214
+ vision_cfg: CLIPVisionCfg,
215
+ text_cfg: CLIPTextCfg,
216
+ quick_gelu: bool = False,
217
+ cast_dtype: Optional[torch.dtype] = None,
218
+ ):
219
+ super().__init__()
220
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
221
+
222
+ text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
223
+ self.transformer = text.transformer
224
+ self.vocab_size = text.vocab_size
225
+ self.token_embedding = text.token_embedding
226
+ self.positional_embedding = text.positional_embedding
227
+ self.ln_final = text.ln_final
228
+ self.text_projection = text.text_projection
229
+ self.register_buffer('attn_mask', text.attn_mask, persistent=False)
230
+
231
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
232
+
233
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
234
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
235
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
236
+
237
+ @torch.jit.ignore
238
+ def set_grad_checkpointing(self, enable=True):
239
+ self.visual.set_grad_checkpointing(enable)
240
+ self.transformer.grad_checkpointing = enable
241
+
242
+ @torch.jit.ignore
243
+ def no_weight_decay(self):
244
+ return {'logit_scale'}
245
+
246
+ def encode_image(self, image, normalize: bool = False):
247
+ features = self.visual(image)
248
+ return F.normalize(features, dim=-1) if normalize else features
249
+
250
+ def encode_text(self, text, normalize: bool = False):
251
+ cast_dtype = self.transformer.get_cast_dtype()
252
+
253
+ x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
254
+
255
+ x = x + self.positional_embedding.to(cast_dtype)
256
+ x = x.permute(1, 0, 2) # NLD -> LND
257
+ x = self.transformer(x, attn_mask=self.attn_mask)
258
+ x = x.permute(1, 0, 2) # LND -> NLD
259
+ x = self.ln_final(x) # [batch_size, n_ctx, transformer.width]
260
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
261
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
262
+ return F.normalize(x, dim=-1) if normalize else x
263
+
264
+ def forward(self, image, text):
265
+ image_features = self.encode_image(image, normalize=True)
266
+ text_features = self.encode_text(text, normalize=True)
267
+ return image_features, text_features, self.logit_scale.exp()
268
+
269
+
270
+ class CustomCLIP(nn.Module):
271
+ def __init__(
272
+ self,
273
+ embed_dim: int,
274
+ vision_cfg: CLIPVisionCfg,
275
+ text_cfg: CLIPTextCfg,
276
+ quick_gelu: bool = False,
277
+ cast_dtype: Optional[torch.dtype] = None,
278
+ itm_task: bool = False,
279
+ ):
280
+ super().__init__()
281
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
282
+ self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
283
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
284
+
285
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
286
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
287
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
288
+
289
+ def lock_text_tower(self, unlocked_layers:int=0, freeze_layer_norm:bool=True):
290
+ self.text.lock(unlocked_layers, freeze_layer_norm)
291
+
292
+ @torch.jit.ignore
293
+ def set_grad_checkpointing(self, enable=True):
294
+ self.visual.set_grad_checkpointing(enable)
295
+ self.text.set_grad_checkpointing(enable)
296
+
297
+ @torch.jit.ignore
298
+ def no_weight_decay(self):
299
+ return {'logit_scale'}
300
+
301
+ def encode_image(self, image, normalize: bool = False):
302
+ features = self.visual(image)
303
+ return F.normalize(features, dim=-1) if normalize else features
304
+
305
+ def encode_text(self, text, normalize: bool = False):
306
+ features = self.text(text)
307
+ return F.normalize(features, dim=-1) if normalize else features
308
+
309
+ def forward(self, image, text):
310
+ image_features = self.encode_image(image, normalize=True)
311
+ text_features = self.encode_text(text, normalize=True)
312
+ return image_features, text_features, self.logit_scale.exp()
313
+
314
+
315
+ def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
316
+ """Convert applicable model parameters to low-precision (bf16 or fp16)"""
317
+
318
+ def _convert_weights(l):
319
+
320
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
321
+ l.weight.data = l.weight.data.to(dtype)
322
+ if l.bias is not None:
323
+ l.bias.data = l.bias.data.to(dtype)
324
+
325
+ if isinstance(l, (nn.MultiheadAttention, Attention)):
326
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
327
+ tensor = getattr(l, attr, None)
328
+ if tensor is not None:
329
+ tensor.data = tensor.data.to(dtype)
330
+
331
+ if isinstance(l, nn.Parameter):
332
+ l.data = l.data.to(dtype)
333
+
334
+ for name in ["text_projection", "proj"]:
335
+ if hasattr(l, name) and isinstance(l, nn.Parameter):
336
+ attr = getattr(l, name, None)
337
+ if attr is not None:
338
+ attr.data = attr.data.to(dtype)
339
+
340
+ model.apply(_convert_weights)
341
+
342
+
343
+ convert_weights_to_fp16 = convert_weights_to_lp # backwards compat
344
+
345
+
346
+ # used to maintain checkpoint compatibility
347
+ def convert_to_custom_text_state_dict(state_dict: dict):
348
+ if 'text_projection' in state_dict:
349
+ # old format state_dict, move text tower -> .text
350
+ new_state_dict = {}
351
+ for k, v in state_dict.items():
352
+ if any(k.startswith(p) for p in (
353
+ 'text_projection',
354
+ 'positional_embedding',
355
+ 'token_embedding',
356
+ 'transformer',
357
+ 'ln_final',
358
+ 'logit_scale'
359
+ )):
360
+ k = 'text.' + k
361
+ new_state_dict[k] = v
362
+ return new_state_dict
363
+ return state_dict
364
+
365
+
366
+ def build_model_from_openai_state_dict(
367
+ state_dict: dict,
368
+ quick_gelu=True,
369
+ cast_dtype=torch.float16,
370
+ ):
371
+ vit = "visual.proj" in state_dict
372
+
373
+ if vit:
374
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
375
+ vision_layers = len(
376
+ [k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
377
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
378
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
379
+ image_size = vision_patch_size * grid_size
380
+ else:
381
+ counts: list = [
382
+ len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
383
+ vision_layers = tuple(counts)
384
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
385
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
386
+ vision_patch_size = None
387
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
388
+ image_size = output_width * 32
389
+
390
+ embed_dim = state_dict["text_projection"].shape[1]
391
+ context_length = state_dict["positional_embedding"].shape[0]
392
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
393
+ transformer_width = state_dict["ln_final.weight"].shape[0]
394
+ transformer_heads = transformer_width // 64
395
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))
396
+
397
+ vision_cfg = CLIPVisionCfg(
398
+ layers=vision_layers,
399
+ width=vision_width,
400
+ patch_size=vision_patch_size,
401
+ image_size=image_size,
402
+ )
403
+ text_cfg = CLIPTextCfg(
404
+ context_length=context_length,
405
+ vocab_size=vocab_size,
406
+ width=transformer_width,
407
+ heads=transformer_heads,
408
+ layers=transformer_layers
409
+ )
410
+ model = CLIP(
411
+ embed_dim,
412
+ vision_cfg=vision_cfg,
413
+ text_cfg=text_cfg,
414
+ quick_gelu=quick_gelu, # OpenAI models were trained with QuickGELU
415
+ cast_dtype=cast_dtype,
416
+ )
417
+
418
+ for key in ["input_resolution", "context_length", "vocab_size"]:
419
+ state_dict.pop(key, None)
420
+
421
+ convert_weights_to_fp16(model) # OpenAI state dicts are partially converted to float16
422
+ model.load_state_dict(state_dict)
423
+ return model.eval()
424
+
425
+
426
+ def trace_model(model, batch_size=256, device=torch.device('cpu')):
427
+ model.eval()
428
+ image_size = model.visual.image_size
429
+ example_images = torch.ones((batch_size, 3, image_size, image_size), device=device)
430
+ example_text = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device)
431
+ model = torch.jit.trace_module(
432
+ model,
433
+ inputs=dict(
434
+ forward=(example_images, example_text),
435
+ encode_text=(example_text,),
436
+ encode_image=(example_images,)
437
+ ))
438
+ model.visual.image_size = image_size
439
+ return model
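
Both `CLIP` and `CustomCLIP` return L2-normalized image features, L2-normalized text features, and the exponentiated logit scale, so downstream code can form the similarity or probability matrix directly. A sketch of that contract with placeholder tensors (illustrative; a real call would be `image_features, text_features, logit_scale = model(images, token_ids)`):

    import torch
    import torch.nn.functional as F

    # stand-ins for the three outputs of model(image, text)
    image_features = F.normalize(torch.randn(4, 512), dim=-1)
    text_features = F.normalize(torch.randn(4, 512), dim=-1)
    logit_scale = torch.tensor(1 / 0.07)  # initial value of logit_scale.exp(), about 14.3

    probs = (logit_scale * image_features @ text_features.T).softmax(dim=-1)
    print(probs.shape)  # torch.Size([4, 4]); row i is the distribution of image i over the 4 texts
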
PuLID_ComfyUI/eva_clip/model_configs/EVA01-CLIP-B-16.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "patch_size": 16,
8
+ "eva_model_name": "eva-clip-b-16",
9
+ "ls_init_value": 0.1,
10
+ "drop_path_rate": 0.0
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12
18
+ }
19
+ }
PuLID_ComfyUI/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 40,
6
+ "width": 1408,
7
+ "head_width": 88,
8
+ "mlp_ratio": 4.3637,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-g-14-x",
11
+ "drop_path_rate": 0,
12
+ "xattn": true,
13
+ "fusedLN": true
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 1024,
19
+ "heads": 16,
20
+ "layers": 24,
21
+ "xattn": false,
22
+ "fusedLN": true
23
+ }
24
+ }
PuLID_ComfyUI/eva_clip/model_configs/EVA01-CLIP-g-14.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 40,
6
+ "width": 1408,
7
+ "head_width": 88,
8
+ "mlp_ratio": 4.3637,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-g-14-x",
11
+ "drop_path_rate": 0.4,
12
+ "xattn": true,
13
+ "fusedLN": true
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 768,
19
+ "heads": 12,
20
+ "layers": 12,
21
+ "xattn": false,
22
+ "fusedLN": true
23
+ }
24
+ }
PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-B-16.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "head_width": 64,
8
+ "patch_size": 16,
9
+ "mlp_ratio": 2.6667,
10
+ "eva_model_name": "eva-clip-b-16-X",
11
+ "drop_path_rate": 0.0,
12
+ "xattn": true,
13
+ "fusedLN": true,
14
+ "rope": true,
15
+ "pt_hw_seq_len": 16,
16
+ "intp_freq": true,
17
+ "naiveswiglu": true,
18
+ "subln": true
19
+ },
20
+ "text_cfg": {
21
+ "context_length": 77,
22
+ "vocab_size": 49408,
23
+ "width": 512,
24
+ "heads": 8,
25
+ "layers": 12,
26
+ "xattn": true,
27
+ "fusedLN": true
28
+ }
29
+ }
PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-L-14-336.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 336,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "drop_path_rate": 0,
8
+ "head_width": 64,
9
+ "mlp_ratio": 2.6667,
10
+ "patch_size": 14,
11
+ "eva_model_name": "eva-clip-l-14-336",
12
+ "xattn": true,
13
+ "fusedLN": true,
14
+ "rope": true,
15
+ "pt_hw_seq_len": 16,
16
+ "intp_freq": true,
17
+ "naiveswiglu": true,
18
+ "subln": true
19
+ },
20
+ "text_cfg": {
21
+ "context_length": 77,
22
+ "vocab_size": 49408,
23
+ "width": 768,
24
+ "heads": 12,
25
+ "layers": 12,
26
+ "xattn": false,
27
+ "fusedLN": true
28
+ }
29
+ }
PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-L-14.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "drop_path_rate": 0,
8
+ "head_width": 64,
9
+ "mlp_ratio": 2.6667,
10
+ "patch_size": 14,
11
+ "eva_model_name": "eva-clip-l-14",
12
+ "xattn": true,
13
+ "fusedLN": true,
14
+ "rope": true,
15
+ "pt_hw_seq_len": 16,
16
+ "intp_freq": true,
17
+ "naiveswiglu": true,
18
+ "subln": true
19
+ },
20
+ "text_cfg": {
21
+ "context_length": 77,
22
+ "vocab_size": 49408,
23
+ "width": 768,
24
+ "heads": 12,
25
+ "layers": 12,
26
+ "xattn": false,
27
+ "fusedLN": true
28
+ }
29
+ }
PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 64,
6
+ "width": 1792,
7
+ "head_width": 112,
8
+ "mlp_ratio": 8.571428571428571,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-4b-14-x",
11
+ "drop_path_rate": 0,
12
+ "xattn": true,
13
+ "postnorm": true,
14
+ "fusedLN": true
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 1280,
20
+ "heads": 20,
21
+ "layers": 32,
22
+ "xattn": false,
23
+ "fusedLN": true
24
+ }
25
+ }
PuLID_ComfyUI/eva_clip/model_configs/EVA02-CLIP-bigE-14.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 64,
6
+ "width": 1792,
7
+ "head_width": 112,
8
+ "mlp_ratio": 8.571428571428571,
9
+ "patch_size": 14,
10
+ "eva_model_name": "eva-clip-4b-14-x",
11
+ "drop_path_rate": 0,
12
+ "xattn": true,
13
+ "postnorm": true,
14
+ "fusedLN": true
15
+ },
16
+ "text_cfg": {
17
+ "context_length": 77,
18
+ "vocab_size": 49408,
19
+ "width": 1024,
20
+ "heads": 16,
21
+ "layers": 24,
22
+ "xattn": false,
23
+ "fusedLN": true
24
+ }
25
+ }
PuLID_ComfyUI/eva_clip/modified_resnet.py ADDED
@@ -0,0 +1,181 @@
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from .utils import freeze_batch_norm_2d
8
+
9
+
10
+ class Bottleneck(nn.Module):
11
+ expansion = 4
12
+
13
+ def __init__(self, inplanes, planes, stride=1):
14
+ super().__init__()
15
+
16
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
17
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
18
+ self.bn1 = nn.BatchNorm2d(planes)
19
+ self.act1 = nn.ReLU(inplace=True)
20
+
21
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
22
+ self.bn2 = nn.BatchNorm2d(planes)
23
+ self.act2 = nn.ReLU(inplace=True)
24
+
25
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
26
+
27
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
28
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
29
+ self.act3 = nn.ReLU(inplace=True)
30
+
31
+ self.downsample = None
32
+ self.stride = stride
33
+
34
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
35
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
36
+ self.downsample = nn.Sequential(OrderedDict([
37
+ ("-1", nn.AvgPool2d(stride)),
38
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
39
+ ("1", nn.BatchNorm2d(planes * self.expansion))
40
+ ]))
41
+
42
+ def forward(self, x: torch.Tensor):
43
+ identity = x
44
+
45
+ out = self.act1(self.bn1(self.conv1(x)))
46
+ out = self.act2(self.bn2(self.conv2(out)))
47
+ out = self.avgpool(out)
48
+ out = self.bn3(self.conv3(out))
49
+
50
+ if self.downsample is not None:
51
+ identity = self.downsample(x)
52
+
53
+ out += identity
54
+ out = self.act3(out)
55
+ return out
56
+
57
+
58
+ class AttentionPool2d(nn.Module):
59
+ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
60
+ super().__init__()
61
+ self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
62
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
63
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
64
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
65
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
66
+ self.num_heads = num_heads
67
+
68
+ def forward(self, x):
69
+ x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
70
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
71
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
72
+ x, _ = F.multi_head_attention_forward(
73
+ query=x, key=x, value=x,
74
+ embed_dim_to_check=x.shape[-1],
75
+ num_heads=self.num_heads,
76
+ q_proj_weight=self.q_proj.weight,
77
+ k_proj_weight=self.k_proj.weight,
78
+ v_proj_weight=self.v_proj.weight,
79
+ in_proj_weight=None,
80
+ in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
81
+ bias_k=None,
82
+ bias_v=None,
83
+ add_zero_attn=False,
84
+ dropout_p=0.,
85
+ out_proj_weight=self.c_proj.weight,
86
+ out_proj_bias=self.c_proj.bias,
87
+ use_separate_proj_weight=True,
88
+ training=self.training,
89
+ need_weights=False
90
+ )
91
+
92
+ return x[0]
93
+
94
+
95
+ class ModifiedResNet(nn.Module):
96
+ """
97
+ A ResNet class that is similar to torchvision's but contains the following changes:
98
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
99
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
100
+ - The final pooling layer is a QKV attention instead of an average pool
101
+ """
102
+
103
+ def __init__(self, layers, output_dim, heads, image_size=224, width=64):
104
+ super().__init__()
105
+ self.output_dim = output_dim
106
+ self.image_size = image_size
107
+
108
+ # the 3-layer stem
109
+ self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
110
+ self.bn1 = nn.BatchNorm2d(width // 2)
111
+ self.act1 = nn.ReLU(inplace=True)
112
+ self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
113
+ self.bn2 = nn.BatchNorm2d(width // 2)
114
+ self.act2 = nn.ReLU(inplace=True)
115
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
116
+ self.bn3 = nn.BatchNorm2d(width)
117
+ self.act3 = nn.ReLU(inplace=True)
118
+ self.avgpool = nn.AvgPool2d(2)
119
+
120
+ # residual layers
121
+ self._inplanes = width # this is a *mutable* variable used during construction
122
+ self.layer1 = self._make_layer(width, layers[0])
123
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
124
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
125
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
126
+
127
+ embed_dim = width * 32 # the ResNet feature dimension
128
+ self.attnpool = AttentionPool2d(image_size // 32, embed_dim, heads, output_dim)
129
+
130
+ self.init_parameters()
131
+
132
+ def _make_layer(self, planes, blocks, stride=1):
133
+ layers = [Bottleneck(self._inplanes, planes, stride)]
134
+
135
+ self._inplanes = planes * Bottleneck.expansion
136
+ for _ in range(1, blocks):
137
+ layers.append(Bottleneck(self._inplanes, planes))
138
+
139
+ return nn.Sequential(*layers)
140
+
141
+ def init_parameters(self):
142
+ if self.attnpool is not None:
143
+ std = self.attnpool.c_proj.in_features ** -0.5
144
+ nn.init.normal_(self.attnpool.q_proj.weight, std=std)
145
+ nn.init.normal_(self.attnpool.k_proj.weight, std=std)
146
+ nn.init.normal_(self.attnpool.v_proj.weight, std=std)
147
+ nn.init.normal_(self.attnpool.c_proj.weight, std=std)
148
+
149
+ for resnet_block in [self.layer1, self.layer2, self.layer3, self.layer4]:
150
+ for name, param in resnet_block.named_parameters():
151
+ if name.endswith("bn3.weight"):
152
+ nn.init.zeros_(param)
153
+
154
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
155
+ assert unlocked_groups == 0, 'partial locking not currently supported for this model'
156
+ for param in self.parameters():
157
+ param.requires_grad = False
158
+ if freeze_bn_stats:
159
+ freeze_batch_norm_2d(self)
160
+
161
+ @torch.jit.ignore
162
+ def set_grad_checkpointing(self, enable=True):
163
+ # FIXME support for non-transformer
164
+ pass
165
+
166
+ def stem(self, x):
167
+ x = self.act1(self.bn1(self.conv1(x)))
168
+ x = self.act2(self.bn2(self.conv2(x)))
169
+ x = self.act3(self.bn3(self.conv3(x)))
170
+ x = self.avgpool(x)
171
+ return x
172
+
173
+ def forward(self, x):
174
+ x = self.stem(x)
175
+ x = self.layer1(x)
176
+ x = self.layer2(x)
177
+ x = self.layer3(x)
178
+ x = self.layer4(x)
179
+ x = self.attnpool(x)
180
+
181
+ return x
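A minimal usage sketch of the ModifiedResNet above (not part of the diff), assuming the RN50-style block counts, heads and width used by CLIP; the values and the `eva_clip.modified_resnet` import path are illustrative assumptions:

import torch
from eva_clip.modified_resnet import ModifiedResNet   # import path is an assumption

# Hypothetical RN50-style layout: block counts, heads and output_dim chosen for illustration.
visual = ModifiedResNet(layers=(3, 4, 6, 3), output_dim=1024, heads=32,
                        image_size=224, width=64)
visual.eval()
with torch.no_grad():
    feats = visual(torch.randn(2, 3, 224, 224))   # attention-pooled features -> (2, 1024)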
PuLID_ComfyUI/eva_clip/openai.py ADDED
@@ -0,0 +1,144 @@
1
+ """ OpenAI pretrained model functions
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+
6
+ import os
7
+ import warnings
8
+ from typing import List, Optional, Union
9
+
10
+ import torch
11
+
12
+ from .model import build_model_from_openai_state_dict, convert_weights_to_lp, get_cast_dtype
13
+ from .pretrained import get_pretrained_url, list_pretrained_models_by_tag, download_pretrained_from_url
14
+
15
+ __all__ = ["list_openai_models", "load_openai_model"]
16
+
17
+
18
+ def list_openai_models() -> List[str]:
19
+ """Returns the names of available CLIP models"""
20
+ return list_pretrained_models_by_tag('openai')
21
+
22
+
23
+ def load_openai_model(
24
+ name: str,
25
+ precision: Optional[str] = None,
26
+ device: Optional[Union[str, torch.device]] = None,
27
+ jit: bool = True,
28
+ cache_dir: Optional[str] = None,
29
+ ):
30
+ """Load a CLIP model
31
+
32
+ Parameters
33
+ ----------
34
+ name : str
35
+ A model name listed by `list_openai_models()`, or the path to a model checkpoint containing the state_dict
36
+ precision: str
37
+ Model precision, if None defaults to 'fp32' if device == 'cpu' else 'fp16'.
38
+ device : Union[str, torch.device]
39
+ The device to put the loaded model
40
+ jit : bool
41
+ Whether to load the optimized JIT model (default) or the more hackable non-JIT model.
42
+ cache_dir : Optional[str]
43
+ The directory to cache the downloaded model weights
44
+
45
+ Returns
46
+ -------
47
+ model : torch.nn.Module
48
+ The CLIP model
49
+ preprocess : Callable[[PIL.Image], torch.Tensor]
50
+ A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
51
+ """
52
+ if device is None:
53
+ device = "cuda" if torch.cuda.is_available() else "cpu"
54
+ if precision is None:
55
+ precision = 'fp32' if device == 'cpu' else 'fp16'
56
+
57
+ if get_pretrained_url(name, 'openai'):
58
+ model_path = download_pretrained_from_url(get_pretrained_url(name, 'openai'), cache_dir=cache_dir)
59
+ elif os.path.isfile(name):
60
+ model_path = name
61
+ else:
62
+ raise RuntimeError(f"Model {name} not found; available models = {list_openai_models()}")
63
+
64
+ try:
65
+ # loading JIT archive
66
+ model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
67
+ state_dict = None
68
+ except RuntimeError:
69
+ # loading saved state dict
70
+ if jit:
71
+ warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
72
+ jit = False
73
+ state_dict = torch.load(model_path, map_location="cpu")
74
+
75
+ if not jit:
76
+ # Build a non-jit model from the OpenAI jitted model state dict
77
+ cast_dtype = get_cast_dtype(precision)
78
+ try:
79
+ model = build_model_from_openai_state_dict(state_dict or model.state_dict(), cast_dtype=cast_dtype)
80
+ except KeyError:
81
+ sd = {k[7:]: v for k, v in state_dict["state_dict"].items()}
82
+ model = build_model_from_openai_state_dict(sd, cast_dtype=cast_dtype)
83
+
84
+ # model from OpenAI state dict is in manually cast fp16 mode, must be converted for AMP/fp32/bf16 use
85
+ model = model.to(device)
86
+ if precision.startswith('amp') or precision == 'fp32':
87
+ model.float()
88
+ elif precision == 'bf16':
89
+ convert_weights_to_lp(model, dtype=torch.bfloat16)
90
+
91
+ return model
92
+
93
+ # patch the device names
94
+ device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
95
+ device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
96
+
97
+ def patch_device(module):
98
+ try:
99
+ graphs = [module.graph] if hasattr(module, "graph") else []
100
+ except RuntimeError:
101
+ graphs = []
102
+
103
+ if hasattr(module, "forward1"):
104
+ graphs.append(module.forward1.graph)
105
+
106
+ for graph in graphs:
107
+ for node in graph.findAllNodes("prim::Constant"):
108
+ if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
109
+ node.copyAttributes(device_node)
110
+
111
+ model.apply(patch_device)
112
+ patch_device(model.encode_image)
113
+ patch_device(model.encode_text)
114
+
115
+ # patch dtype to float32 (typically for CPU)
116
+ if precision == 'fp32':
117
+ float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
118
+ float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
119
+ float_node = float_input.node()
120
+
121
+ def patch_float(module):
122
+ try:
123
+ graphs = [module.graph] if hasattr(module, "graph") else []
124
+ except RuntimeError:
125
+ graphs = []
126
+
127
+ if hasattr(module, "forward1"):
128
+ graphs.append(module.forward1.graph)
129
+
130
+ for graph in graphs:
131
+ for node in graph.findAllNodes("aten::to"):
132
+ inputs = list(node.inputs())
133
+ for i in [1, 2]: # dtype can be the second or third argument to aten::to()
134
+ if inputs[i].node()["value"] == 5:
135
+ inputs[i].node().copyAttributes(float_node)
136
+
137
+ model.apply(patch_float)
138
+ patch_float(model.encode_image)
139
+ patch_float(model.encode_text)
140
+ model.float()
141
+
142
+ # ensure image_size attr available at consistent location for both jit and non-jit
143
+ model.visual.image_size = model.input_resolution.item()
144
+ return model
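As a usage sketch (not part of the upstream file): loading one of the OpenAI checkpoints registered in pretrained.py below might look like this. The `eva_clip` import path and the `OpenaiCLIP-B-32` name are assumptions taken from the registry, and the checkpoint is downloaded on first use:

import torch
from eva_clip.openai import list_openai_models, load_openai_model

print(list_openai_models())                         # architectures that carry an 'openai' tag
model = load_openai_model("OpenaiCLIP-B-32", precision="fp32",
                          device="cpu", jit=False)  # build the non-JIT module for easier inspection
with torch.no_grad():
    image_features = model.encode_image(torch.randn(1, 3, 224, 224))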
PuLID_ComfyUI/eva_clip/pretrained.py ADDED
@@ -0,0 +1,332 @@
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from functools import partial
6
+ from typing import Dict, Union
7
+
8
+ from tqdm import tqdm
9
+
10
+ try:
11
+ from huggingface_hub import hf_hub_download
12
+ _has_hf_hub = True
13
+ except ImportError:
14
+ hf_hub_download = None
15
+ _has_hf_hub = False
16
+
17
+
18
+ def _pcfg(url='', hf_hub='', filename='', mean=None, std=None):
19
+ return dict(
20
+ url=url,
21
+ hf_hub=hf_hub,
22
+ mean=mean,
23
+ std=std,
24
+ )
25
+
26
+ _VITB32 = dict(
27
+ openai=_pcfg(
28
+ "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
29
+ laion400m_e31=_pcfg(
30
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
31
+ laion400m_e32=_pcfg(
32
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
33
+ laion2b_e16=_pcfg(
34
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-laion2b_e16-af8dbd0c.pth"),
35
+ laion2b_s34b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-B-32-laion2B-s34B-b79K/')
36
+ )
37
+
38
+ _VITB32_quickgelu = dict(
39
+ openai=_pcfg(
40
+ "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt"),
41
+ laion400m_e31=_pcfg(
42
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e31-d867053b.pt"),
43
+ laion400m_e32=_pcfg(
44
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_32-quickgelu-laion400m_e32-46683a32.pt"),
45
+ )
46
+
47
+ _VITB16 = dict(
48
+ openai=_pcfg(
49
+ "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt"),
50
+ laion400m_e31=_pcfg(
51
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e31-00efa78f.pt"),
52
+ laion400m_e32=_pcfg(
53
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16-laion400m_e32-55e67d44.pt"),
54
+ laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-B-16-laion2B-s34B-b88K/'),
55
+ )
56
+
57
+ _EVAB16 = dict(
58
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt'),
59
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_B_psz14to16.pt'),
60
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt'),
61
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_B_psz16_s8B.pt'),
62
+ )
63
+
64
+ _VITB16_PLUS_240 = dict(
65
+ laion400m_e31=_pcfg(
66
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e31-8fb26589.pt"),
67
+ laion400m_e32=_pcfg(
68
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_b_16_plus_240-laion400m_e32-699c4b84.pt"),
69
+ )
70
+
71
+ _VITL14 = dict(
72
+ openai=_pcfg(
73
+ "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt"),
74
+ laion400m_e31=_pcfg(
75
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e31-69988bb6.pt"),
76
+ laion400m_e32=_pcfg(
77
+ "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"),
78
+ laion2b_s32b_b82k=_pcfg(
79
+ hf_hub='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/',
80
+ mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
81
+ )
82
+
83
+ _EVAL14 = dict(
84
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_L_psz14.pt'),
85
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_L_psz14.pt'),
86
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt'),
87
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_s4B.pt'),
88
+ )
89
+
90
+ _VITL14_336 = dict(
91
+ openai=_pcfg(
92
+ "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt"),
93
+ )
94
+
95
+ _EVAL14_336 = dict(
96
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt'),
97
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_336_psz14_s6B.pt'),
98
+ eva_clip_224to336=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt'),
99
+ eva02_clip_224to336=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_L_psz14_224to336.pt'),
100
+ )
101
+
102
+ _VITH14 = dict(
103
+ laion2b_s32b_b79k=_pcfg(hf_hub='laion/CLIP-ViT-H-14-laion2B-s32B-b79K/'),
104
+ )
105
+
106
+ _VITg14 = dict(
107
+ laion2b_s12b_b42k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s12B-b42K/'),
108
+ laion2b_s34b_b88k=_pcfg(hf_hub='laion/CLIP-ViT-g-14-laion2B-s34B-b88K/'),
109
+ )
110
+
111
+ _EVAg14 = dict(
112
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/'),
113
+ eva01=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_g_psz14.pt'),
114
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt'),
115
+ eva01_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt'),
116
+ )
117
+
118
+ _EVAg14_PLUS = dict(
119
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/'),
120
+ eva01=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_g_psz14.pt'),
121
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt'),
122
+ eva01_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA01_CLIP_g_14_plus_psz14_s11B.pt'),
123
+ )
124
+
125
+ _VITbigG14 = dict(
126
+ laion2b_s39b_b160k=_pcfg(hf_hub='laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/'),
127
+ )
128
+
129
+ _EVAbigE14 = dict(
130
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
131
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
132
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt'),
133
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_s4B.pt'),
134
+ )
135
+
136
+ _EVAbigE14_PLUS = dict(
137
+ eva=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
138
+ eva02=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_E_psz14.pt'),
139
+ eva_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt'),
140
+ eva02_clip=_pcfg(hf_hub='QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt'),
141
+ )
142
+
143
+
144
+ _PRETRAINED = {
145
+ # "ViT-B-32": _VITB32,
146
+ "OpenaiCLIP-B-32": _VITB32,
147
+ "OpenCLIP-B-32": _VITB32,
148
+
149
+ # "ViT-B-32-quickgelu": _VITB32_quickgelu,
150
+ "OpenaiCLIP-B-32-quickgelu": _VITB32_quickgelu,
151
+ "OpenCLIP-B-32-quickgelu": _VITB32_quickgelu,
152
+
153
+ # "ViT-B-16": _VITB16,
154
+ "OpenaiCLIP-B-16": _VITB16,
155
+ "OpenCLIP-B-16": _VITB16,
156
+
157
+ "EVA02-B-16": _EVAB16,
158
+ "EVA02-CLIP-B-16": _EVAB16,
159
+
160
+ # "ViT-B-16-plus-240": _VITB16_PLUS_240,
161
+ "OpenCLIP-B-16-plus-240": _VITB16_PLUS_240,
162
+
163
+ # "ViT-L-14": _VITL14,
164
+ "OpenaiCLIP-L-14": _VITL14,
165
+ "OpenCLIP-L-14": _VITL14,
166
+
167
+ "EVA02-L-14": _EVAL14,
168
+ "EVA02-CLIP-L-14": _EVAL14,
169
+
170
+ # "ViT-L-14-336": _VITL14_336,
171
+ "OpenaiCLIP-L-14-336": _VITL14_336,
172
+
173
+ "EVA02-CLIP-L-14-336": _EVAL14_336,
174
+
175
+ # "ViT-H-14": _VITH14,
176
+ # "ViT-g-14": _VITg14,
177
+ "OpenCLIP-H-14": _VITH14,
178
+ "OpenCLIP-g-14": _VITg14,
179
+
180
+ "EVA01-CLIP-g-14": _EVAg14,
181
+ "EVA01-CLIP-g-14-plus": _EVAg14_PLUS,
182
+
183
+ # "ViT-bigG-14": _VITbigG14,
184
+ "OpenCLIP-bigG-14": _VITbigG14,
185
+
186
+ "EVA02-CLIP-bigE-14": _EVAbigE14,
187
+ "EVA02-CLIP-bigE-14-plus": _EVAbigE14_PLUS,
188
+ }
189
+
190
+
191
+ def _clean_tag(tag: str):
192
+ # normalize pretrained tags
193
+ return tag.lower().replace('-', '_')
194
+
195
+
196
+ def list_pretrained(as_str: bool = False):
197
+ """ returns list of pretrained models
198
+ Returns a tuple (model_name, pretrain_tag) by default or 'name:tag' if as_str == True
199
+ """
200
+ return [':'.join([k, t]) if as_str else (k, t) for k in _PRETRAINED.keys() for t in _PRETRAINED[k].keys()]
201
+
202
+
203
+ def list_pretrained_models_by_tag(tag: str):
204
+ """ return all models having the specified pretrain tag """
205
+ models = []
206
+ tag = _clean_tag(tag)
207
+ for k in _PRETRAINED.keys():
208
+ if tag in _PRETRAINED[k]:
209
+ models.append(k)
210
+ return models
211
+
212
+
213
+ def list_pretrained_tags_by_model(model: str):
214
+ """ return all pretrain tags for the specified model architecture """
215
+ tags = []
216
+ if model in _PRETRAINED:
217
+ tags.extend(_PRETRAINED[model].keys())
218
+ return tags
219
+
220
+
221
+ def is_pretrained_cfg(model: str, tag: str):
222
+ if model not in _PRETRAINED:
223
+ return False
224
+ return _clean_tag(tag) in _PRETRAINED[model]
225
+
226
+
227
+ def get_pretrained_cfg(model: str, tag: str):
228
+ if model not in _PRETRAINED:
229
+ return {}
230
+ model_pretrained = _PRETRAINED[model]
231
+ return model_pretrained.get(_clean_tag(tag), {})
232
+
233
+
234
+ def get_pretrained_url(model: str, tag: str):
235
+ cfg = get_pretrained_cfg(model, _clean_tag(tag))
236
+ return cfg.get('url', '')
237
+
238
+
239
+ def download_pretrained_from_url(
240
+ url: str,
241
+ cache_dir: Union[str, None] = None,
242
+ ):
243
+ if not cache_dir:
244
+ cache_dir = os.path.expanduser("~/.cache/clip")
245
+ os.makedirs(cache_dir, exist_ok=True)
246
+ filename = os.path.basename(url)
247
+
248
+ if 'openaipublic' in url:
249
+ expected_sha256 = url.split("/")[-2]
250
+ elif 'mlfoundations' in url:
251
+ expected_sha256 = os.path.splitext(filename)[0].split("-")[-1]
252
+ else:
253
+ expected_sha256 = ''
254
+
255
+ download_target = os.path.join(cache_dir, filename)
256
+
257
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
258
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
259
+
260
+ if os.path.isfile(download_target):
261
+ if expected_sha256:
262
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
263
+ return download_target
264
+ else:
265
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
266
+ else:
267
+ return download_target
268
+
269
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
270
+ with tqdm(total=int(source.headers.get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
271
+ while True:
272
+ buffer = source.read(8192)
273
+ if not buffer:
274
+ break
275
+
276
+ output.write(buffer)
277
+ loop.update(len(buffer))
278
+
279
+ if expected_sha256 and not hashlib.sha256(open(download_target, "rb").read()).hexdigest().startswith(expected_sha256):
280
+ raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
281
+
282
+ return download_target
283
+
284
+
285
+ def has_hf_hub(necessary=False):
286
+ if not _has_hf_hub and necessary:
287
+ # if no HF Hub module installed, and it is necessary to continue, raise error
288
+ raise RuntimeError(
289
+ 'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.')
290
+ return _has_hf_hub
291
+
292
+
293
+ def download_pretrained_from_hf(
294
+ model_id: str,
295
+ filename: str = 'open_clip_pytorch_model.bin',
296
+ revision=None,
297
+ cache_dir: Union[str, None] = None,
298
+ ):
299
+ has_hf_hub(True)
300
+ cached_file = hf_hub_download(model_id, filename, revision=revision, cache_dir=cache_dir)
301
+ return cached_file
302
+
303
+
304
+ def download_pretrained(
305
+ cfg: Dict,
306
+ force_hf_hub: bool = False,
307
+ cache_dir: Union[str, None] = None,
308
+ ):
309
+ target = ''
310
+ if not cfg:
311
+ return target
312
+
313
+ download_url = cfg.get('url', '')
314
+ download_hf_hub = cfg.get('hf_hub', '')
315
+ if download_hf_hub and force_hf_hub:
316
+ # use HF hub even if url exists
317
+ download_url = ''
318
+
319
+ if download_url:
320
+ target = download_pretrained_from_url(download_url, cache_dir=cache_dir)
321
+ elif download_hf_hub:
322
+ has_hf_hub(True)
323
+ # we assume the hf_hub entries in pretrained config combine model_id + filename in
324
+ # 'org/model_name/filename.pt' form. To specify just the model id w/o filename and
325
+ # use 'open_clip_pytorch_model.bin' default, there must be a trailing slash 'org/model_name/'.
326
+ model_id, filename = os.path.split(download_hf_hub)
327
+ if filename:
328
+ target = download_pretrained_from_hf(model_id, filename=filename, cache_dir=cache_dir)
329
+ else:
330
+ target = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
331
+
332
+ return target
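A sketch of querying the registry defined above; the `eva_clip` import path is an assumption, and nothing is fetched unless `download_pretrained()` is actually called. Note the trailing-slash convention for `hf_hub` entries described in the comment above.

from eva_clip.pretrained import (
    list_pretrained, list_pretrained_tags_by_model,
    get_pretrained_cfg, download_pretrained)

print(list_pretrained(as_str=True)[:3])                   # e.g. 'OpenaiCLIP-B-32:openai'
print(list_pretrained_tags_by_model("EVA02-CLIP-L-14"))   # ['eva', 'eva02', 'eva_clip', 'eva02_clip']

cfg = get_pretrained_cfg("EVA02-CLIP-L-14", "eva_clip")   # {'url': '', 'hf_hub': 'QuanSun/EVA-CLIP/...', ...}
# ckpt_path = download_pretrained(cfg)                    # would download the checkpoint from the HF Hub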
PuLID_ComfyUI/eva_clip/rope.py ADDED
@@ -0,0 +1,137 @@
1
+ from math import pi
2
+ import torch
3
+ from torch import nn
4
+ from einops import rearrange, repeat
5
+ import logging
6
+
7
+ def broadcat(tensors, dim = -1):
8
+ num_tensors = len(tensors)
9
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
10
+ assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
11
+ shape_len = list(shape_lens)[0]
12
+ dim = (dim + shape_len) if dim < 0 else dim
13
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
14
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
15
+ assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
16
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
17
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
18
+ expanded_dims.insert(dim, (dim, dims[dim]))
19
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
20
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
21
+ return torch.cat(tensors, dim = dim)
22
+
23
+ def rotate_half(x):
24
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
25
+ x1, x2 = x.unbind(dim = -1)
26
+ x = torch.stack((-x2, x1), dim = -1)
27
+ return rearrange(x, '... d r -> ... (d r)')
28
+
29
+
30
+ class VisionRotaryEmbedding(nn.Module):
31
+ def __init__(
32
+ self,
33
+ dim,
34
+ pt_seq_len,
35
+ ft_seq_len=None,
36
+ custom_freqs = None,
37
+ freqs_for = 'lang',
38
+ theta = 10000,
39
+ max_freq = 10,
40
+ num_freqs = 1,
41
+ ):
42
+ super().__init__()
43
+ if custom_freqs:
44
+ freqs = custom_freqs
45
+ elif freqs_for == 'lang':
46
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
47
+ elif freqs_for == 'pixel':
48
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
49
+ elif freqs_for == 'constant':
50
+ freqs = torch.ones(num_freqs).float()
51
+ else:
52
+ raise ValueError(f'unknown modality {freqs_for}')
53
+
54
+ if ft_seq_len is None: ft_seq_len = pt_seq_len
55
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
56
+
57
+ freqs_h = torch.einsum('..., f -> ... f', t, freqs)
58
+ freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
59
+
60
+ freqs_w = torch.einsum('..., f -> ... f', t, freqs)
61
+ freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
62
+
63
+ freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim = -1)
64
+
65
+ self.register_buffer("freqs_cos", freqs.cos())
66
+ self.register_buffer("freqs_sin", freqs.sin())
67
+
68
+ logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
69
+
70
+ def forward(self, t, start_index = 0):
71
+ rot_dim = self.freqs_cos.shape[-1]
72
+ end_index = start_index + rot_dim
73
+ assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
74
+ t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
75
+ t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
76
+
77
+ return torch.cat((t_left, t, t_right), dim = -1)
78
+
79
+ class VisionRotaryEmbeddingFast(nn.Module):
80
+ def __init__(
81
+ self,
82
+ dim,
83
+ pt_seq_len,
84
+ ft_seq_len=None,
85
+ custom_freqs = None,
86
+ freqs_for = 'lang',
87
+ theta = 10000,
88
+ max_freq = 10,
89
+ num_freqs = 1,
90
+ patch_dropout = 0.
91
+ ):
92
+ super().__init__()
93
+ if custom_freqs:
94
+ freqs = custom_freqs
95
+ elif freqs_for == 'lang':
96
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
97
+ elif freqs_for == 'pixel':
98
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
99
+ elif freqs_for == 'constant':
100
+ freqs = torch.ones(num_freqs).float()
101
+ else:
102
+ raise ValueError(f'unknown modality {freqs_for}')
103
+
104
+ if ft_seq_len is None: ft_seq_len = pt_seq_len
105
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
106
+
107
+ freqs = torch.einsum('..., f -> ... f', t, freqs)
108
+ freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
109
+ freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)
110
+
111
+ freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
112
+ freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
113
+
114
+ self.patch_dropout = patch_dropout
115
+
116
+ self.register_buffer("freqs_cos", freqs_cos)
117
+ self.register_buffer("freqs_sin", freqs_sin)
118
+
119
+ logging.info(f'Shape of rope freq: {self.freqs_cos.shape}')
120
+
121
+ def forward(self, t, patch_indices_keep=None):
122
+ if patch_indices_keep is not None:
123
+ batch = t.size()[0]
124
+ batch_indices = torch.arange(batch)
125
+ batch_indices = batch_indices[..., None]
126
+
127
+ freqs_cos = repeat(self.freqs_cos, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
128
+ freqs_sin = repeat(self.freqs_sin, 'i j -> n i m j', n=t.shape[0], m=t.shape[1])
129
+
130
+ freqs_cos = freqs_cos[batch_indices, patch_indices_keep]
131
+ freqs_cos = rearrange(freqs_cos, 'n i m j -> n m i j')
132
+ freqs_sin = freqs_sin[batch_indices, patch_indices_keep]
133
+ freqs_sin = rearrange(freqs_sin, 'n i m j -> n m i j')
134
+
135
+ return t * freqs_cos + rotate_half(t) * freqs_sin
136
+
137
+ return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
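A shape-only sketch of the fast variant above. In EVA-CLIP the module is typically built with half the attention head dimension and the patch-grid side length, so the sizes and the import path below are illustrative assumptions:

import torch
from eva_clip.rope import VisionRotaryEmbeddingFast   # import path is an assumption

rope = VisionRotaryEmbeddingFast(dim=32, pt_seq_len=16)   # 16x16 patch grid, rotary dim 32
q = torch.randn(2, 8, 16 * 16, 64)    # (batch, heads, patch tokens, head_dim = 2 * rotary dim)
q_rot = rope(q)                       # same shape, with 2D rotary position encoding applied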
PuLID_ComfyUI/eva_clip/timm_model.py ADDED
@@ -0,0 +1,122 @@
1
+ """ timm model adapter
2
+
3
+ Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in a CLIP model.
4
+ """
5
+ import logging
6
+ from collections import OrderedDict
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ try:
12
+ import timm
13
+ from timm.models.layers import Mlp, to_2tuple
14
+ try:
15
+ # old timm imports < 0.8.1
16
+ from timm.models.layers.attention_pool2d import RotAttentionPool2d
17
+ from timm.models.layers.attention_pool2d import AttentionPool2d as AbsAttentionPool2d
18
+ except ImportError:
19
+ # new timm imports >= 0.8.1
20
+ from timm.layers import RotAttentionPool2d
21
+ from timm.layers import AttentionPool2d as AbsAttentionPool2d
22
+ except ImportError:
23
+ timm = None
24
+
25
+ from .utils import freeze_batch_norm_2d
26
+
27
+
28
+ class TimmModel(nn.Module):
29
+ """ timm model adapter
30
+ # FIXME this adapter is a work in progress, may change in ways that break weight compat
31
+ """
32
+
33
+ def __init__(
34
+ self,
35
+ model_name,
36
+ embed_dim,
37
+ image_size=224,
38
+ pool='avg',
39
+ proj='linear',
40
+ proj_bias=False,
41
+ drop=0.,
42
+ pretrained=False):
43
+ super().__init__()
44
+ if timm is None:
45
+ raise RuntimeError("Please `pip install timm` to use timm models.")
46
+
47
+ self.image_size = to_2tuple(image_size)
48
+ self.trunk = timm.create_model(model_name, pretrained=pretrained)
49
+ feat_size = self.trunk.default_cfg.get('pool_size', None)
50
+ feature_ndim = 1 if not feat_size else 2
51
+ if pool in ('abs_attn', 'rot_attn'):
52
+ assert feature_ndim == 2
53
+ # if attn pooling used, remove both classifier and default pool
54
+ self.trunk.reset_classifier(0, global_pool='')
55
+ else:
56
+ # reset global pool if pool config set, otherwise leave as network default
57
+ reset_kwargs = dict(global_pool=pool) if pool else {}
58
+ self.trunk.reset_classifier(0, **reset_kwargs)
59
+ prev_chs = self.trunk.num_features
60
+
61
+ head_layers = OrderedDict()
62
+ if pool == 'abs_attn':
63
+ head_layers['pool'] = AbsAttentionPool2d(prev_chs, feat_size=feat_size, out_features=embed_dim)
64
+ prev_chs = embed_dim
65
+ elif pool == 'rot_attn':
66
+ head_layers['pool'] = RotAttentionPool2d(prev_chs, out_features=embed_dim)
67
+ prev_chs = embed_dim
68
+ else:
69
+ assert proj, 'projection layer needed if non-attention pooling is used.'
70
+
71
+ # NOTE attention pool ends with a projection layer, so proj should usually be set to '' if such pooling is used
72
+ if proj == 'linear':
73
+ head_layers['drop'] = nn.Dropout(drop)
74
+ head_layers['proj'] = nn.Linear(prev_chs, embed_dim, bias=proj_bias)
75
+ elif proj == 'mlp':
76
+ head_layers['mlp'] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=drop, bias=(True, proj_bias))
77
+
78
+ self.head = nn.Sequential(head_layers)
79
+
80
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
81
+ """ lock modules
82
+ Args:
83
+ unlocked_groups (int): leave last n layer groups unlocked (default: 0)
84
+ """
85
+ if not unlocked_groups:
86
+ # lock full model
87
+ for param in self.trunk.parameters():
88
+ param.requires_grad = False
89
+ if freeze_bn_stats:
90
+ freeze_batch_norm_2d(self.trunk)
91
+ else:
92
+ # NOTE: partial freeze requires latest timm (master) branch and is subject to change
93
+ try:
94
+ # FIXME import here until API stable and in an official release
95
+ from timm.models.helpers import group_parameters, group_modules
96
+ except ImportError:
97
+ raise RuntimeError(
98
+ 'Please install latest timm `pip install git+https://github.com/rwightman/pytorch-image-models`')
99
+ matcher = self.trunk.group_matcher()
100
+ gparams = group_parameters(self.trunk, matcher)
101
+ max_layer_id = max(gparams.keys())
102
+ max_layer_id = max_layer_id - unlocked_groups
103
+ for group_idx in range(max_layer_id + 1):
104
+ group = gparams[group_idx]
105
+ for param in group:
106
+ self.trunk.get_parameter(param).requires_grad = False
107
+ if freeze_bn_stats:
108
+ gmodules = group_modules(self.trunk, matcher, reverse=True)
109
+ gmodules = {k for k, v in gmodules.items() if v <= max_layer_id}
110
+ freeze_batch_norm_2d(self.trunk, gmodules)
111
+
112
+ @torch.jit.ignore
113
+ def set_grad_checkpointing(self, enable=True):
114
+ try:
115
+ self.trunk.set_grad_checkpointing(enable)
116
+ except Exception as e:
117
+ logging.warning('grad checkpointing not supported for this timm image tower, continuing without...')
118
+
119
+ def forward(self, x):
120
+ x = self.trunk(x)
121
+ x = self.head(x)
122
+ return x
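A sketch of the adapter above wrapping a timm backbone as a CLIP-style image tower. It requires `pip install timm`; the backbone name, embedding size and import path are illustrative assumptions:

import torch
from eva_clip.timm_model import TimmModel   # import path is an assumption

tower = TimmModel("resnet50", embed_dim=512, image_size=224,
                  pool="avg", proj="linear", pretrained=False)
tower.eval()
with torch.no_grad():
    emb = tower(torch.randn(1, 3, 224, 224))   # global-pooled features + linear proj -> (1, 512)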
PuLID_ComfyUI/eva_clip/tokenizer.py ADDED
@@ -0,0 +1,201 @@
1
+ """ CLIP tokenizer
2
+
3
+ Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ import gzip
6
+ import html
7
+ import os
8
+ from functools import lru_cache
9
+ from typing import Union, List
10
+
11
+ import ftfy
12
+ import regex as re
13
+ import torch
14
+
15
+ # https://stackoverflow.com/q/62691279
16
+ import os
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
+
19
+
20
+ @lru_cache()
21
+ def default_bpe():
22
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
23
+
24
+
25
+ @lru_cache()
26
+ def bytes_to_unicode():
27
+ """
28
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
29
+ The reversible bpe codes work on unicode strings.
30
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
31
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
32
+ This is a significant percentage of your normal, say, 32K bpe vocab.
33
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
34
+ This also avoids mapping to whitespace/control characters that the bpe code barfs on.
35
+ """
36
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
37
+ cs = bs[:]
38
+ n = 0
39
+ for b in range(2**8):
40
+ if b not in bs:
41
+ bs.append(b)
42
+ cs.append(2**8+n)
43
+ n += 1
44
+ cs = [chr(n) for n in cs]
45
+ return dict(zip(bs, cs))
46
+
47
+
48
+ def get_pairs(word):
49
+ """Return set of symbol pairs in a word.
50
+ Word is represented as tuple of symbols (symbols being variable-length strings).
51
+ """
52
+ pairs = set()
53
+ prev_char = word[0]
54
+ for char in word[1:]:
55
+ pairs.add((prev_char, char))
56
+ prev_char = char
57
+ return pairs
58
+
59
+
60
+ def basic_clean(text):
61
+ text = ftfy.fix_text(text)
62
+ text = html.unescape(html.unescape(text))
63
+ return text.strip()
64
+
65
+
66
+ def whitespace_clean(text):
67
+ text = re.sub(r'\s+', ' ', text)
68
+ text = text.strip()
69
+ return text
70
+
71
+
72
+ class SimpleTokenizer(object):
73
+ def __init__(self, bpe_path: str = default_bpe(), special_tokens=None):
74
+ self.byte_encoder = bytes_to_unicode()
75
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
76
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
77
+ merges = merges[1:49152-256-2+1]
78
+ merges = [tuple(merge.split()) for merge in merges]
79
+ vocab = list(bytes_to_unicode().values())
80
+ vocab = vocab + [v+'</w>' for v in vocab]
81
+ for merge in merges:
82
+ vocab.append(''.join(merge))
83
+ if not special_tokens:
84
+ special_tokens = ['<start_of_text>', '<end_of_text>']
85
+ else:
86
+ special_tokens = ['<start_of_text>', '<end_of_text>'] + special_tokens
87
+ vocab.extend(special_tokens)
88
+ self.encoder = dict(zip(vocab, range(len(vocab))))
89
+ self.decoder = {v: k for k, v in self.encoder.items()}
90
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
91
+ self.cache = {t:t for t in special_tokens}
92
+ special = "|".join(special_tokens)
93
+ self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
94
+
95
+ self.vocab_size = len(self.encoder)
96
+ self.all_special_ids = [self.encoder[t] for t in special_tokens]
97
+
98
+ def bpe(self, token):
99
+ if token in self.cache:
100
+ return self.cache[token]
101
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
102
+ pairs = get_pairs(word)
103
+
104
+ if not pairs:
105
+ return token+'</w>'
106
+
107
+ while True:
108
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
109
+ if bigram not in self.bpe_ranks:
110
+ break
111
+ first, second = bigram
112
+ new_word = []
113
+ i = 0
114
+ while i < len(word):
115
+ try:
116
+ j = word.index(first, i)
117
+ new_word.extend(word[i:j])
118
+ i = j
119
+ except:
120
+ new_word.extend(word[i:])
121
+ break
122
+
123
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
124
+ new_word.append(first+second)
125
+ i += 2
126
+ else:
127
+ new_word.append(word[i])
128
+ i += 1
129
+ new_word = tuple(new_word)
130
+ word = new_word
131
+ if len(word) == 1:
132
+ break
133
+ else:
134
+ pairs = get_pairs(word)
135
+ word = ' '.join(word)
136
+ self.cache[token] = word
137
+ return word
138
+
139
+ def encode(self, text):
140
+ bpe_tokens = []
141
+ text = whitespace_clean(basic_clean(text)).lower()
142
+ for token in re.findall(self.pat, text):
143
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
144
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
145
+ return bpe_tokens
146
+
147
+ def decode(self, tokens):
148
+ text = ''.join([self.decoder[token] for token in tokens])
149
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
150
+ return text
151
+
152
+
153
+ _tokenizer = SimpleTokenizer()
154
+
155
+
156
+ def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor:
157
+ """
158
+ Returns the tokenized representation of given input string(s)
159
+
160
+ Parameters
161
+ ----------
162
+ texts : Union[str, List[str]]
163
+ An input string or a list of input strings to tokenize
164
+ context_length : int
165
+ The context length to use; all CLIP models use 77 as the context length
166
+
167
+ Returns
168
+ -------
169
+ A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
170
+ """
171
+ if isinstance(texts, str):
172
+ texts = [texts]
173
+
174
+ sot_token = _tokenizer.encoder["<start_of_text>"]
175
+ eot_token = _tokenizer.encoder["<end_of_text>"]
176
+ all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
177
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
178
+
179
+ for i, tokens in enumerate(all_tokens):
180
+ if len(tokens) > context_length:
181
+ tokens = tokens[:context_length] # Truncate
182
+ tokens[-1] = eot_token
183
+ result[i, :len(tokens)] = torch.tensor(tokens)
184
+
185
+ return result
186
+
187
+
188
+ class HFTokenizer:
189
+ "HuggingFace tokenizer wrapper"
190
+ def __init__(self, tokenizer_name:str):
191
+ from transformers import AutoTokenizer
192
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
193
+
194
+ def __call__(self, texts:Union[str, List[str]], context_length:int=77) -> torch.Tensor:
195
+ # same cleaning as for default tokenizer, except lowercasing
196
+ # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance
197
+ if isinstance(texts, str):
198
+ texts = [texts]
199
+ texts = [whitespace_clean(basic_clean(text)) for text in texts]
200
+ input_ids = self.tokenizer(texts, return_tensors='pt', max_length=context_length, padding='max_length', truncation=True).input_ids
201
+ return input_ids
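Both tokenizer front-ends above produce `(batch, context_length)` LongTensors. A small usage sketch (the `eva_clip` import path and the HF model name are illustrative assumptions):

import torch
from eva_clip.tokenizer import tokenize, HFTokenizer   # import path is an assumption

tokens = tokenize(["a photo of a cat", "a photo of a dog"])   # -> torch.Size([2, 77])
assert tokens.dtype == torch.long

# hf_tok = HFTokenizer("roberta-base")                        # for HF-vocabulary text towers
# tokens = hf_tok(["a photo of a cat"], context_length=77)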
PuLID_ComfyUI/eva_clip/transform.py ADDED
@@ -0,0 +1,103 @@
1
+ from typing import Optional, Sequence, Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torchvision.transforms.functional as F
6
+
7
+ from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \
8
+ CenterCrop
9
+
10
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
11
+
12
+
13
+ class ResizeMaxSize(nn.Module):
14
+
15
+ def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0):
16
+ super().__init__()
17
+ if not isinstance(max_size, int):
18
+ raise TypeError(f"Size should be int. Got {type(max_size)}")
19
+ self.max_size = max_size
20
+ self.interpolation = interpolation
21
+ self.fn = min if fn == 'min' else max
22
+ self.fill = fill
23
+
24
+ def forward(self, img):
25
+ if isinstance(img, torch.Tensor):
26
+ height, width = img.shape[:2]
27
+ else:
28
+ width, height = img.size
29
+ scale = self.max_size / float(max(height, width))
30
+ if scale != 1.0:
31
+ new_size = tuple(round(dim * scale) for dim in (height, width))
32
+ img = F.resize(img, new_size, self.interpolation)
33
+ pad_h = self.max_size - new_size[0]
34
+ pad_w = self.max_size - new_size[1]
35
+ img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill)
36
+ return img
37
+
38
+
39
+ def _convert_to_rgb(image):
40
+ return image.convert('RGB')
41
+
42
+
43
+ # class CatGen(nn.Module):
44
+ # def __init__(self, num=4):
45
+ # self.num = num
46
+ # def mixgen_batch(image, text):
47
+ # batch_size = image.shape[0]
48
+ # index = np.random.permutation(batch_size)
49
+
50
+ # cat_images = []
51
+ # for i in range(batch_size):
52
+ # # image mixup
53
+ # image[i,:] = lam * image[i,:] + (1 - lam) * image[index[i],:]
54
+ # # text concat
55
+ # text[i] = tokenizer((str(text[i]) + " " + str(text[index[i]])))[0]
56
+ # text = torch.stack(text)
57
+ # return image, text
58
+
59
+
60
+ def image_transform(
61
+ image_size: int,
62
+ is_train: bool,
63
+ mean: Optional[Tuple[float, ...]] = None,
64
+ std: Optional[Tuple[float, ...]] = None,
65
+ resize_longest_max: bool = False,
66
+ fill_color: int = 0,
67
+ ):
68
+ mean = mean or OPENAI_DATASET_MEAN
69
+ if not isinstance(mean, (list, tuple)):
70
+ mean = (mean,) * 3
71
+
72
+ std = std or OPENAI_DATASET_STD
73
+ if not isinstance(std, (list, tuple)):
74
+ std = (std,) * 3
75
+
76
+ if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]:
77
+ # for square size, pass size as int so that Resize() uses aspect preserving shortest edge
78
+ image_size = image_size[0]
79
+
80
+ normalize = Normalize(mean=mean, std=std)
81
+ if is_train:
82
+ return Compose([
83
+ RandomResizedCrop(image_size, scale=(0.9, 1.0), interpolation=InterpolationMode.BICUBIC),
84
+ _convert_to_rgb,
85
+ ToTensor(),
86
+ normalize,
87
+ ])
88
+ else:
89
+ if resize_longest_max:
90
+ transforms = [
91
+ ResizeMaxSize(image_size, fill=fill_color)
92
+ ]
93
+ else:
94
+ transforms = [
95
+ Resize(image_size, interpolation=InterpolationMode.BICUBIC),
96
+ CenterCrop(image_size),
97
+ ]
98
+ transforms.extend([
99
+ _convert_to_rgb,
100
+ ToTensor(),
101
+ normalize,
102
+ ])
103
+ return Compose(transforms)
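A sketch of building the evaluation-time preprocessing defined above and applying it to a PIL image; the blank image and the import path are stand-ins:

from PIL import Image
from eva_clip.transform import image_transform   # import path is an assumption

preprocess = image_transform(image_size=224, is_train=False)
img = Image.new("RGB", (640, 480))   # placeholder for a real photo
tensor = preprocess(img)             # resize + center crop + normalize -> torch.Size([3, 224, 224])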
PuLID_ComfyUI/eva_clip/transformer.py ADDED
@@ -0,0 +1,737 @@
1
+ import os
2
+ import logging
3
+ from collections import OrderedDict
4
+ import math
5
+ from typing import Callable, Optional, Sequence
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+
11
+ try:
12
+ from timm.models.layers import trunc_normal_
13
+ except:
14
+ from timm.layers import trunc_normal_
15
+
16
+ from .rope import VisionRotaryEmbedding, VisionRotaryEmbeddingFast
17
+ from .utils import to_2tuple
18
+
19
+ if os.getenv('ENV_TYPE') == 'deepspeed':
20
+ try:
21
+ import deepspeed
22
+ from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint
23
+ except:
24
+ print("Please 'pip install deepspeed'")
25
+ deepspeed = None
26
+ from torch.utils.checkpoint import checkpoint
27
+ else:
28
+ from torch.utils.checkpoint import checkpoint
29
+
30
+ try:
31
+ import xformers.ops as xops
32
+ except ImportError:
33
+ xops = None
34
+ print("Please 'pip install xformers'")
35
+
36
+ class LayerNormFp32(nn.LayerNorm):
37
+ """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back)."""
38
+ def __init__(self, *args, **kwargs):
39
+ super().__init__(*args, **kwargs)
40
+
41
+ def forward(self, x: torch.Tensor):
42
+ output = F.layer_norm(
43
+ x.float(),
44
+ self.normalized_shape,
45
+ self.weight.float() if self.weight is not None else None,
46
+ self.bias.float() if self.bias is not None else None,
47
+ self.eps,
48
+ )
49
+ return output.type_as(x)
50
+
51
+
52
+ class LayerNorm(nn.LayerNorm):
53
+ """Subclass torch's LayerNorm (with cast back to input dtype)."""
54
+
55
+ def forward(self, x: torch.Tensor):
56
+ orig_type = x.dtype
57
+ x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
58
+ return x.to(orig_type)
59
+
60
+ class QuickGELU(nn.Module):
61
+ # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory
62
+ def forward(self, x: torch.Tensor):
63
+ return x * torch.sigmoid(1.702 * x)
64
+
65
+
66
+ class LayerScale(nn.Module):
67
+ def __init__(self, dim, init_values=1e-5, inplace=False):
68
+ super().__init__()
69
+ self.inplace = inplace
70
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
71
+
72
+ def forward(self, x):
73
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
74
+
75
+ class PatchDropout(nn.Module):
76
+ """
77
+ https://arxiv.org/abs/2212.00794
78
+ """
79
+
80
+ def __init__(self, prob, exclude_first_token=True):
81
+ super().__init__()
82
+ assert 0 <= prob < 1.
83
+ self.prob = prob
84
+ self.exclude_first_token = exclude_first_token # exclude CLS token
85
+ logging.info(f"os.getenv('RoPE')={os.getenv('RoPE')}")
86
+
87
+ def forward(self, x):
88
+ if not self.training or self.prob == 0.:
89
+ return x
90
+
91
+ if self.exclude_first_token:
92
+ cls_tokens, x = x[:, :1], x[:, 1:]
93
+ else:
94
+ cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
95
+
96
+ batch = x.size()[0]
97
+ num_tokens = x.size()[1]
98
+
99
+ batch_indices = torch.arange(batch)
100
+ batch_indices = batch_indices[..., None]
101
+
102
+ keep_prob = 1 - self.prob
103
+ num_patches_keep = max(1, int(num_tokens * keep_prob))
104
+
105
+ rand = torch.randn(batch, num_tokens)
106
+ patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices
107
+
108
+ x = x[batch_indices, patch_indices_keep]
109
+
110
+ if self.exclude_first_token:
111
+ x = torch.cat((cls_tokens, x), dim=1)
112
+
113
+ if self.training and os.getenv('RoPE') == '1':
114
+ return x, patch_indices_keep
115
+
116
+ return x
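A shape-only sketch of the class above in training mode (the RoPE environment-variable branch is ignored; the token counts and import path are illustrative assumptions):

import torch
from eva_clip.transformer import PatchDropout   # import path is an assumption

pd = PatchDropout(prob=0.5, exclude_first_token=True)
pd.train()
x = torch.randn(2, 1 + 196, 768)   # CLS token + 14x14 patches (illustrative sizes)
out = pd(x)                        # keeps the CLS token plus ~98 random patches -> (2, 99, 768)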
117
+
118
+
119
+ def _in_projection_packed(
120
+ q: torch.Tensor,
121
+ k: torch.Tensor,
122
+ v: torch.Tensor,
123
+ w: torch.Tensor,
124
+ b: Optional[torch.Tensor] = None,
125
+ ):
126
+ """
127
+ https://github.com/pytorch/pytorch/blob/db2a237763eb8693a20788be94f8c192e762baa8/torch/nn/functional.py#L4726
128
+ """
129
+ E = q.size(-1)
130
+ if k is v:
131
+ if q is k:
132
+ # self-attention
133
+ return F.linear(q, w, b).chunk(3, dim=-1)
134
+ else:
135
+ # encoder-decoder attention
136
+ w_q, w_kv = w.split([E, E * 2])
137
+ if b is None:
138
+ b_q = b_kv = None
139
+ else:
140
+ b_q, b_kv = b.split([E, E * 2])
141
+ return (F.linear(q, w_q, b_q),) + F.linear(k, w_kv, b_kv).chunk(2, dim=-1)
142
+ else:
143
+ w_q, w_k, w_v = w.chunk(3)
144
+ if b is None:
145
+ b_q = b_k = b_v = None
146
+ else:
147
+ b_q, b_k, b_v = b.chunk(3)
148
+ return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
149
+
150
+ class Attention(nn.Module):
151
+ def __init__(
152
+ self,
153
+ dim,
154
+ num_heads=8,
155
+ qkv_bias=True,
156
+ scaled_cosine=False,
157
+ scale_heads=False,
158
+ logit_scale_max=math.log(1. / 0.01),
159
+ attn_drop=0.,
160
+ proj_drop=0.,
161
+ xattn=False,
162
+ rope=False
163
+ ):
164
+ super().__init__()
165
+ self.scaled_cosine = scaled_cosine
166
+ self.scale_heads = scale_heads
167
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
168
+ self.num_heads = num_heads
169
+ self.head_dim = dim // num_heads
170
+ self.scale = self.head_dim ** -0.5
171
+ self.logit_scale_max = logit_scale_max
172
+
173
+ # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
174
+ self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
175
+ if qkv_bias:
176
+ self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
177
+ else:
178
+ self.in_proj_bias = None
179
+
180
+ if self.scaled_cosine:
181
+ self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
182
+ else:
183
+ self.logit_scale = None
184
+ self.attn_drop = nn.Dropout(attn_drop)
185
+ if self.scale_heads:
186
+ self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
187
+ else:
188
+ self.head_scale = None
189
+ self.out_proj = nn.Linear(dim, dim)
190
+ self.out_drop = nn.Dropout(proj_drop)
191
+ self.xattn = xattn
192
+ self.xattn_drop = attn_drop
193
+ self.rope = rope
194
+
195
+ def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
196
+ L, N, C = x.shape
197
+ q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
198
+ if self.xattn:
199
+ q = q.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1)
200
+ k = k.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1)
201
+ v = v.contiguous().view(L, N, self.num_heads, -1).transpose(0, 1)
202
+
203
+ x = xops.memory_efficient_attention(
204
+ q, k, v,
205
+ p=self.xattn_drop,
206
+ scale=self.scale if self.logit_scale is None else None,
207
+ attn_bias=xops.LowerTriangularMask() if attn_mask is not None else None,
208
+ )
209
+ else:
210
+ q = q.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
211
+ k = k.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
212
+ v = v.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
213
+
214
+ if self.logit_scale is not None:
215
+ attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
216
+ logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
217
+ attn = attn.view(N, self.num_heads, L, L) * logit_scale
218
+ attn = attn.view(-1, L, L)
219
+ else:
220
+ q = q * self.scale
221
+ attn = torch.bmm(q, k.transpose(-1, -2))
222
+
223
+ if attn_mask is not None:
224
+ if attn_mask.dtype == torch.bool:
225
+ new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
226
+ new_attn_mask.masked_fill_(attn_mask, float("-inf"))
227
+ attn_mask = new_attn_mask
228
+ attn += attn_mask
229
+
230
+ attn = attn.softmax(dim=-1)
231
+ attn = self.attn_drop(attn)
232
+
233
+ x = torch.bmm(attn, v)
234
+
235
+ if self.head_scale is not None:
236
+ x = x.view(N, self.num_heads, L, C) * self.head_scale
237
+ x = x.view(-1, L, C)
238
+ x = x.transpose(0, 1).reshape(L, N, C)
239
+ x = self.out_proj(x)
240
+ x = self.out_drop(x)
241
+ return x
242
+
243
+ class CustomAttention(nn.Module):
244
+ def __init__(
245
+ self,
246
+ dim,
247
+ num_heads=8,
248
+ qkv_bias=True,
249
+ scaled_cosine=True,
250
+ scale_heads=False,
251
+ logit_scale_max=math.log(1. / 0.01),
252
+ attn_drop=0.,
253
+ proj_drop=0.,
254
+ xattn=False
255
+ ):
256
+ super().__init__()
257
+ self.scaled_cosine = scaled_cosine
258
+ self.scale_heads = scale_heads
259
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
260
+ self.num_heads = num_heads
261
+ self.head_dim = dim // num_heads
262
+ self.scale = self.head_dim ** -0.5
263
+ self.logit_scale_max = logit_scale_max
264
+
265
+ # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
266
+ self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
267
+ if qkv_bias:
268
+ self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
269
+ else:
270
+ self.in_proj_bias = None
271
+
272
+ if self.scaled_cosine:
273
+ self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
274
+ else:
275
+ self.logit_scale = None
276
+ self.attn_drop = nn.Dropout(attn_drop)
277
+ if self.scale_heads:
278
+ self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
279
+ else:
280
+ self.head_scale = None
281
+ self.out_proj = nn.Linear(dim, dim)
282
+ self.out_drop = nn.Dropout(proj_drop)
283
+ self.xattn = xattn
284
+ self.xattn_drop = attn_drop
285
+
286
+ def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
287
+ q, k, v = _in_projection_packed(query, key, value, self.in_proj_weight, self.in_proj_bias)
288
+ N_q, B_q, C_q = q.shape
289
+ N_k, B_k, C_k = k.shape
290
+ N_v, B_v, C_v = v.shape
291
+ if self.xattn:
292
+ # B, N, C -> B, N, num_heads, C
293
+ q = q.permute(1, 0, 2).reshape(B_q, N_q, self.num_heads, -1)
294
+ k = k.permute(1, 0, 2).reshape(B_k, N_k, self.num_heads, -1)
295
+ v = v.permute(1, 0, 2).reshape(B_v, N_v, self.num_heads, -1)
296
+
297
+ x = xops.memory_efficient_attention(
298
+ q, k, v,
299
+ p=self.xattn_drop,
300
+ scale=self.scale if self.logit_scale is None else None,
301
+ attn_bias=xops.LowerTriangularMask() if attn_mask is not None else None
302
+ )
303
+ else:
304
+ # B*H, L, C
305
+ q = q.contiguous().view(N_q, B_q * self.num_heads, -1).transpose(0, 1)
306
+ k = k.contiguous().view(N_k, B_k * self.num_heads, -1).transpose(0, 1)
307
+ v = v.contiguous().view(N_v, B_v * self.num_heads, -1).transpose(0, 1)
308
+
309
+ if self.logit_scale is not None:
310
+ # B*H, N_q, N_k
311
+ attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
312
+ logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
313
+ attn = attn.view(B_q, self.num_heads, N_q, N_k) * logit_scale
314
+ attn = attn.view(-1, N_q, N_k)
315
+ else:
316
+ q = q * self.scale
317
+ attn = torch.bmm(q, k.transpose(-1, -2))
318
+
319
+ if attn_mask is not None:
320
+ if attn_mask.dtype == torch.bool:
321
+ new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
322
+ new_attn_mask.masked_fill_(attn_mask, float("-inf"))
323
+ attn_mask = new_attn_mask
324
+ attn += attn_mask
325
+
326
+ attn = attn.softmax(dim=-1)
327
+ attn = self.attn_drop(attn)
328
+
329
+ x = torch.bmm(attn, v)
330
+
331
+ if self.head_scale is not None:
332
+ x = x.view(B_q, self.num_heads, N_q, C_q) * self.head_scale
333
+ x = x.view(-1, N_q, C_q)
334
+ x = x.transpose(0, 1).reshape(N_q, B_q, C_q)
335
+ x = self.out_proj(x)
336
+ x = self.out_drop(x)
337
+ return x
338
+
339
+ class CustomResidualAttentionBlock(nn.Module):
340
+ def __init__(
341
+ self,
342
+ d_model: int,
343
+ n_head: int,
344
+ mlp_ratio: float = 4.0,
345
+ ls_init_value: float = None,
346
+ act_layer: Callable = nn.GELU,
347
+ norm_layer: Callable = LayerNorm,
348
+ scale_cosine_attn: bool = False,
349
+ scale_heads: bool = False,
350
+ scale_attn: bool = False,
351
+ scale_fc: bool = False,
352
+ cross_attn: bool = False,
353
+ xattn: bool = False,
354
+ ):
355
+ super().__init__()
356
+
357
+ self.ln_1 = norm_layer(d_model)
358
+ self.ln_1_k = norm_layer(d_model) if cross_attn else self.ln_1
359
+ self.ln_1_v = norm_layer(d_model) if cross_attn else self.ln_1
360
+ self.attn = CustomAttention(
361
+ d_model, n_head,
362
+ qkv_bias=True,
363
+ attn_drop=0.,
364
+ proj_drop=0.,
365
+ scaled_cosine=scale_cosine_attn,
366
+ scale_heads=scale_heads,
367
+ xattn=xattn
368
+ )
369
+
370
+ self.ln_attn = norm_layer(d_model) if scale_attn else nn.Identity()
371
+ self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
372
+
373
+ self.ln_2 = norm_layer(d_model)
374
+ mlp_width = int(d_model * mlp_ratio)
375
+ self.mlp = nn.Sequential(OrderedDict([
376
+ ("c_fc", nn.Linear(d_model, mlp_width)),
377
+ ('ln', norm_layer(mlp_width) if scale_fc else nn.Identity()),
378
+ ("gelu", act_layer()),
379
+ ("c_proj", nn.Linear(mlp_width, d_model))
380
+ ]))
381
+
382
+ self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
383
+
384
+ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
385
+ q = q + self.ls_1(self.ln_attn(self.attn(self.ln_1(q), self.ln_1_k(k), self.ln_1_v(v), attn_mask=attn_mask)))
386
+ q = q + self.ls_2(self.mlp(self.ln_2(q)))
387
+ return q
388
+
389
+ class CustomTransformer(nn.Module):
390
+ def __init__(
391
+ self,
392
+ width: int,
393
+ layers: int,
394
+ heads: int,
395
+ mlp_ratio: float = 4.0,
396
+ ls_init_value: float = None,
397
+ act_layer: Callable = nn.GELU,
398
+ norm_layer: Callable = LayerNorm,
399
+ scale_cosine_attn: bool = True,
400
+ scale_heads: bool = False,
401
+ scale_attn: bool = False,
402
+ scale_fc: bool = False,
403
+ cross_attn: bool = False,
404
+ xattn: bool = False,
405
+ ):
406
+ super().__init__()
407
+ self.width = width
408
+ self.layers = layers
409
+ self.grad_checkpointing = False
410
+ self.xattn = xattn
411
+
412
+ self.resblocks = nn.ModuleList([
413
+ CustomResidualAttentionBlock(
414
+ width,
415
+ heads,
416
+ mlp_ratio,
417
+ ls_init_value=ls_init_value,
418
+ act_layer=act_layer,
419
+ norm_layer=norm_layer,
420
+ scale_cosine_attn=scale_cosine_attn,
421
+ scale_heads=scale_heads,
422
+ scale_attn=scale_attn,
423
+ scale_fc=scale_fc,
424
+ cross_attn=cross_attn,
425
+ xattn=xattn)
426
+ for _ in range(layers)
427
+ ])
428
+
429
+ def get_cast_dtype(self) -> torch.dtype:
430
+ return self.resblocks[0].mlp.c_fc.weight.dtype
431
+
432
+ def forward(self, q: torch.Tensor, k: torch.Tensor = None, v: torch.Tensor = None, attn_mask: Optional[torch.Tensor] = None):
433
+ if k is None and v is None:
434
+ k = v = q
435
+ for r in self.resblocks:
436
+ if self.grad_checkpointing and not torch.jit.is_scripting():
437
+ q = checkpoint(r, q, k, v, attn_mask)
438
+ else:
439
+ q = r(q, k, v, attn_mask=attn_mask)
440
+ return q
441
+
442
+
443
+ class ResidualAttentionBlock(nn.Module):
444
+ def __init__(
445
+ self,
446
+ d_model: int,
447
+ n_head: int,
448
+ mlp_ratio: float = 4.0,
449
+ ls_init_value: float = None,
450
+ act_layer: Callable = nn.GELU,
451
+ norm_layer: Callable = LayerNorm,
452
+ xattn: bool = False,
453
+ ):
454
+ super().__init__()
455
+
456
+ self.ln_1 = norm_layer(d_model)
457
+ if xattn:
458
+ self.attn = Attention(d_model, n_head, xattn=True)
459
+ else:
460
+ self.attn = nn.MultiheadAttention(d_model, n_head)
461
+ self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
462
+
463
+ self.ln_2 = norm_layer(d_model)
464
+ mlp_width = int(d_model * mlp_ratio)
465
+ self.mlp = nn.Sequential(OrderedDict([
466
+ ("c_fc", nn.Linear(d_model, mlp_width)),
467
+ ("gelu", act_layer()),
468
+ ("c_proj", nn.Linear(mlp_width, d_model))
469
+ ]))
470
+
471
+ self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
472
+ self.xattn = xattn
473
+
474
+ def attention(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
475
+ attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None
476
+ if self.xattn:
477
+ return self.attn(x, attn_mask=attn_mask)
478
+ return self.attn(x, x, x, need_weights=False, attn_mask=attn_mask)[0]
479
+
480
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
481
+ x = x + self.ls_1(self.attention(self.ln_1(x), attn_mask=attn_mask))
482
+ x = x + self.ls_2(self.mlp(self.ln_2(x)))
483
+ return x
484
+
485
+ class Transformer(nn.Module):
486
+ def __init__(
487
+ self,
488
+ width: int,
489
+ layers: int,
490
+ heads: int,
491
+ mlp_ratio: float = 4.0,
492
+ ls_init_value: float = None,
493
+ act_layer: Callable = nn.GELU,
494
+ norm_layer: Callable = LayerNorm,
495
+ xattn: bool = False,
496
+ ):
497
+ super().__init__()
498
+ self.width = width
499
+ self.layers = layers
500
+ self.grad_checkpointing = False
501
+
502
+ self.resblocks = nn.ModuleList([
503
+ ResidualAttentionBlock(
504
+ width, heads, mlp_ratio, ls_init_value=ls_init_value, act_layer=act_layer, norm_layer=norm_layer, xattn=xattn)
505
+ for _ in range(layers)
506
+ ])
507
+
508
+ def get_cast_dtype(self) -> torch.dtype:
509
+ return self.resblocks[0].mlp.c_fc.weight.dtype
510
+
511
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
512
+ for r in self.resblocks:
513
+ if self.grad_checkpointing and not torch.jit.is_scripting():
514
+ x = checkpoint(r, x, attn_mask)
515
+ else:
516
+ x = r(x, attn_mask=attn_mask)
517
+ return x
518
+
519
+
520
+ class VisionTransformer(nn.Module):
521
+ def __init__(
522
+ self,
523
+ image_size: int,
524
+ patch_size: int,
525
+ width: int,
526
+ layers: int,
527
+ heads: int,
528
+ mlp_ratio: float,
529
+ ls_init_value: float = None,
530
+ patch_dropout: float = 0.,
531
+ global_average_pool: bool = False,
532
+ output_dim: int = 512,
533
+ act_layer: Callable = nn.GELU,
534
+ norm_layer: Callable = LayerNorm,
535
+ xattn: bool = False,
536
+ ):
537
+ super().__init__()
538
+ self.image_size = to_2tuple(image_size)
539
+ self.patch_size = to_2tuple(patch_size)
540
+ self.grid_size = (self.image_size[0] // self.patch_size[0], self.image_size[1] // self.patch_size[1])
541
+ self.output_dim = output_dim
542
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
543
+
544
+ scale = width ** -0.5
545
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
546
+ self.positional_embedding = nn.Parameter(scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width))
547
+
548
+ # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
549
+ self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
550
+ self.ln_pre = norm_layer(width)
551
+
552
+ self.transformer = Transformer(
553
+ width,
554
+ layers,
555
+ heads,
556
+ mlp_ratio,
557
+ ls_init_value=ls_init_value,
558
+ act_layer=act_layer,
559
+ norm_layer=norm_layer,
560
+ xattn=xattn
561
+ )
562
+
563
+ self.global_average_pool = global_average_pool
564
+ self.ln_post = norm_layer(width)
565
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
566
+
567
+ def lock(self, unlocked_groups=0, freeze_bn_stats=False):
568
+ for param in self.parameters():
569
+ param.requires_grad = False
570
+
571
+ if unlocked_groups != 0:
572
+ groups = [
573
+ [
574
+ self.conv1,
575
+ self.class_embedding,
576
+ self.positional_embedding,
577
+ self.ln_pre,
578
+ ],
579
+ *self.transformer.resblocks[:-1],
580
+ [
581
+ self.transformer.resblocks[-1],
582
+ self.ln_post,
583
+ ],
584
+ self.proj,
585
+ ]
586
+
587
+ def _unlock(x):
588
+ if isinstance(x, Sequence):
589
+ for g in x:
590
+ _unlock(g)
591
+ else:
592
+ if isinstance(x, torch.nn.Parameter):
593
+ x.requires_grad = True
594
+ else:
595
+ for p in x.parameters():
596
+ p.requires_grad = True
597
+
598
+ _unlock(groups[-unlocked_groups:])
599
+
600
+ def get_num_layers(self):
601
+ return self.transformer.layers
602
+
603
+ @torch.jit.ignore
604
+ def set_grad_checkpointing(self, enable=True):
605
+ self.transformer.grad_checkpointing = enable
606
+
607
+ @torch.jit.ignore
608
+ def no_weight_decay(self):
609
+ return {'positional_embedding', 'class_embedding'}
610
+
611
+ def forward(self, x: torch.Tensor, return_all_features: bool=False):
612
+ x = self.conv1(x) # shape = [*, width, grid, grid]
613
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
614
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
615
+ x = torch.cat(
616
+ [self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
617
+ x], dim=1) # shape = [*, grid ** 2 + 1, width]
618
+ x = x + self.positional_embedding.to(x.dtype)
619
+
620
+ # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
621
+ x = self.patch_dropout(x)
622
+ x = self.ln_pre(x)
623
+
624
+ x = x.permute(1, 0, 2) # NLD -> LND
625
+ x = self.transformer(x)
626
+ x = x.permute(1, 0, 2) # LND -> NLD
627
+
628
+ if not return_all_features:
629
+ if self.global_average_pool:
630
+ x = x.mean(dim=1) #x = x[:,1:,:].mean(dim=1)
631
+ else:
632
+ x = x[:, 0]
633
+
634
+ x = self.ln_post(x)
635
+
636
+ if self.proj is not None:
637
+ x = x @ self.proj
638
+
639
+ return x
640
+
641
+
642
+ class TextTransformer(nn.Module):
643
+ def __init__(
644
+ self,
645
+ context_length: int = 77,
646
+ vocab_size: int = 49408,
647
+ width: int = 512,
648
+ heads: int = 8,
649
+ layers: int = 12,
650
+ ls_init_value: float = None,
651
+ output_dim: int = 512,
652
+ act_layer: Callable = nn.GELU,
653
+ norm_layer: Callable = LayerNorm,
654
+ xattn: bool = False,
655
+ attn_mask: bool = True
656
+ ):
657
+ super().__init__()
658
+ self.context_length = context_length
659
+ self.vocab_size = vocab_size
660
+ self.width = width
661
+ self.output_dim = output_dim
662
+
663
+ self.token_embedding = nn.Embedding(vocab_size, width)
664
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width))
665
+ self.transformer = Transformer(
666
+ width=width,
667
+ layers=layers,
668
+ heads=heads,
669
+ ls_init_value=ls_init_value,
670
+ act_layer=act_layer,
671
+ norm_layer=norm_layer,
672
+ xattn=xattn
673
+ )
674
+
675
+ self.xattn = xattn
676
+ self.ln_final = norm_layer(width)
677
+ self.text_projection = nn.Parameter(torch.empty(width, output_dim))
678
+
679
+ if attn_mask:
680
+ self.register_buffer('attn_mask', self.build_attention_mask(), persistent=False)
681
+ else:
682
+ self.attn_mask = None
683
+
684
+ self.init_parameters()
685
+
686
+ def init_parameters(self):
687
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
688
+ nn.init.normal_(self.positional_embedding, std=0.01)
689
+
690
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
691
+ attn_std = self.transformer.width ** -0.5
692
+ fc_std = (2 * self.transformer.width) ** -0.5
693
+ for block in self.transformer.resblocks:
694
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
695
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
696
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
697
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
698
+
699
+ if self.text_projection is not None:
700
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
701
+
702
+ @torch.jit.ignore
703
+ def set_grad_checkpointing(self, enable=True):
704
+ self.transformer.grad_checkpointing = enable
705
+
706
+ @torch.jit.ignore
707
+ def no_weight_decay(self):
708
+ # return {'positional_embedding', 'token_embedding'}
709
+ return {'positional_embedding'}
710
+
711
+ def get_num_layers(self):
712
+ return self.transformer.layers
713
+
714
+ def build_attention_mask(self):
715
+ # lazily create causal attention mask, with full attention between the vision tokens
716
+ # pytorch uses additive attention mask; fill with -inf
717
+ mask = torch.empty(self.context_length, self.context_length)
718
+ mask.fill_(float("-inf"))
719
+ mask.triu_(1) # zero out the lower diagonal
720
+ return mask
721
+
722
+ def forward(self, text, return_all_features: bool=False):
723
+ cast_dtype = self.transformer.get_cast_dtype()
724
+ x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
725
+
726
+ x = x + self.positional_embedding.to(cast_dtype)
727
+ x = x.permute(1, 0, 2) # NLD -> LND
728
+ x = self.transformer(x, attn_mask=self.attn_mask)
729
+ # x = self.transformer(x) # no attention mask is applied
730
+ x = x.permute(1, 0, 2) # LND -> NLD
731
+ x = self.ln_final(x)
732
+
733
+ if not return_all_features:
734
+ # x.shape = [batch_size, n_ctx, transformer.width]
735
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
736
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
737
+ return x
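
The `transformer.py` diff above ends with `TextTransformer`, whose one non-obvious piece of setup is the causal attention mask registered in `__init__`. The sketch below simply replays the `build_attention_mask` logic on a toy sequence length to show what the buffer contains; `context_length = 5` is an illustrative value (the class defaults to 77).

```python
import torch

# Minimal sketch of the additive causal mask built by TextTransformer.build_attention_mask.
context_length = 5  # illustrative; the class default is 77
mask = torch.empty(context_length, context_length)
mask.fill_(float("-inf"))
mask.triu_(1)  # zeros on and below the diagonal, -inf strictly above it

print(mask)
# tensor([[0., -inf, -inf, -inf, -inf],
#         [0.,   0., -inf, -inf, -inf],
#         [0.,   0.,   0., -inf, -inf],
#         [0.,   0.,   0.,   0., -inf],
#         [0.,   0.,   0.,   0.,   0.]])
# Added to the attention logits, row i lets token i attend only to tokens 0..i.
```
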
PuLID_ComfyUI/eva_clip/utils.py ADDED
@@ -0,0 +1,326 @@
1
+ from itertools import repeat
2
+ import collections.abc
3
+ import logging
4
+ import math
5
+ import numpy as np
6
+
7
+ import torch
8
+ from torch import nn as nn
9
+ from torchvision.ops.misc import FrozenBatchNorm2d
10
+ import torch.nn.functional as F
11
+
12
+ # open CLIP
13
+ def resize_clip_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
14
+ # Rescale the grid of position embeddings when loading from state_dict
15
+ old_pos_embed = state_dict.get('visual.positional_embedding', None)
16
+ if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
17
+ return
18
+ grid_size = to_2tuple(model.visual.grid_size)
19
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
20
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
21
+ if new_seq_len == old_pos_embed.shape[0]:
22
+ return
23
+
24
+ if extra_tokens:
25
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
26
+ else:
27
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
28
+ old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
29
+
30
+ logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
31
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
32
+ pos_emb_img = F.interpolate(
33
+ pos_emb_img,
34
+ size=grid_size,
35
+ mode=interpolation,
36
+ align_corners=True,
37
+ )
38
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
39
+ if pos_emb_tok is not None:
40
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
41
+ else:
42
+ new_pos_embed = pos_emb_img
43
+ state_dict['visual.positional_embedding'] = new_pos_embed
44
+
45
+
46
+ def resize_visual_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
47
+ # Rescale the grid of position embeddings when loading from state_dict
48
+ old_pos_embed = state_dict.get('positional_embedding', None)
49
+ if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
50
+ return
51
+ grid_size = to_2tuple(model.visual.grid_size)
52
+ extra_tokens = 1 # FIXME detect different token configs (ie no class token, or more)
53
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
54
+ if new_seq_len == old_pos_embed.shape[0]:
55
+ return
56
+
57
+ if extra_tokens:
58
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
59
+ else:
60
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
61
+ old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
62
+
63
+ logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
64
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
65
+ pos_emb_img = F.interpolate(
66
+ pos_emb_img,
67
+ size=grid_size,
68
+ mode=interpolation,
69
+ align_corners=True,
70
+ )
71
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
72
+ if pos_emb_tok is not None:
73
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
74
+ else:
75
+ new_pos_embed = pos_emb_img
76
+ state_dict['positional_embedding'] = new_pos_embed
77
+
78
+ def resize_evaclip_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
79
+ all_keys = list(state_dict.keys())
80
+ # interpolate position embedding
81
+ if 'visual.pos_embed' in state_dict:
82
+ pos_embed_checkpoint = state_dict['visual.pos_embed']
83
+ embedding_size = pos_embed_checkpoint.shape[-1]
84
+ num_patches = model.visual.patch_embed.num_patches
85
+ num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches
86
+ # height (== width) for the checkpoint position embedding
87
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
88
+ # height (== width) for the new position embedding
89
+ new_size = int(num_patches ** 0.5)
90
+ # class_token and dist_token are kept unchanged
91
+ if orig_size != new_size:
92
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
93
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
94
+ # only the position tokens are interpolated
95
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
96
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
97
+ pos_tokens = torch.nn.functional.interpolate(
98
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
99
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
100
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
101
+ state_dict['visual.pos_embed'] = new_pos_embed
102
+
103
+ patch_embed_proj = state_dict['visual.patch_embed.proj.weight']
104
+ patch_size = model.visual.patch_embed.patch_size
105
+ state_dict['visual.patch_embed.proj.weight'] = torch.nn.functional.interpolate(
106
+ patch_embed_proj.float(), size=patch_size, mode='bicubic', align_corners=False)
107
+
108
+
109
+ def resize_eva_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
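
The position-embedding resize functions above all follow the same recipe: split off the extra (class) tokens, reshape the patch tokens into a square grid, bicubic-interpolate the grid to the new size, flatten, and concatenate back. A minimal sketch on dummy tensors, with illustrative sizes (a 14x14 checkpoint grid resized to 16x16, embedding dim 8):

```python
import torch
import torch.nn.functional as F

# Dummy checkpoint pos-embed: 1 class token + 14*14 patch tokens, embed dim 8.
pos_embed = torch.randn(1, 1 + 14 * 14, 8)
extra_tokens, pos_tokens = pos_embed[:, :1], pos_embed[:, 1:]

# Reshape to an (N, C, H, W) grid, resize with bicubic interpolation, flatten back.
pos_tokens = pos_tokens.reshape(1, 14, 14, 8).permute(0, 3, 1, 2)
pos_tokens = F.interpolate(pos_tokens, size=(16, 16), mode='bicubic', align_corners=False)
pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)

new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
print(new_pos_embed.shape)  # torch.Size([1, 257, 8]) -> fits a 16x16 patch grid plus the class token
```
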
110
+ all_keys = list(state_dict.keys())
111
+ # interpolate position embedding
112
+ if 'pos_embed' in state_dict:
113
+ pos_embed_checkpoint = state_dict['pos_embed']
114
+ embedding_size = pos_embed_checkpoint.shape[-1]
115
+ num_patches = model.visual.patch_embed.num_patches
116
+ num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches
117
+ # height (== width) for the checkpoint position embedding
118
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
119
+ # height (== width) for the new position embedding
120
+ new_size = int(num_patches ** 0.5)
121
+ # class_token and dist_token are kept unchanged
122
+ if orig_size != new_size:
123
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
124
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
125
+ # only the position tokens are interpolated
126
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
127
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
128
+ pos_tokens = torch.nn.functional.interpolate(
129
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
130
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
131
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
132
+ state_dict['pos_embed'] = new_pos_embed
133
+
134
+ patch_embed_proj = state_dict['patch_embed.proj.weight']
135
+ patch_size = model.visual.patch_embed.patch_size
136
+ state_dict['patch_embed.proj.weight'] = torch.nn.functional.interpolate(
137
+ patch_embed_proj.float(), size=patch_size, mode='bicubic', align_corners=False)
138
+
139
+
140
+ def resize_rel_pos_embed(state_dict, model, interpolation: str = 'bicubic', seq_dim=1):
141
+ all_keys = list(state_dict.keys())
142
+ for key in all_keys:
143
+ if "relative_position_index" in key:
144
+ state_dict.pop(key)
145
+
146
+ if "relative_position_bias_table" in key:
147
+ rel_pos_bias = state_dict[key]
148
+ src_num_pos, num_attn_heads = rel_pos_bias.size()
149
+ dst_num_pos, _ = model.visual.state_dict()[key].size()
150
+ dst_patch_shape = model.visual.patch_embed.patch_shape
151
+ if dst_patch_shape[0] != dst_patch_shape[1]:
152
+ raise NotImplementedError()
153
+ num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * (dst_patch_shape[1] * 2 - 1)
154
+ src_size = int((src_num_pos - num_extra_tokens) ** 0.5)
155
+ dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5)
156
+ if src_size != dst_size:
157
+ print("Position interpolate for %s from %dx%d to %dx%d" % (
158
+ key, src_size, src_size, dst_size, dst_size))
159
+ extra_tokens = rel_pos_bias[-num_extra_tokens:, :]
160
+ rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :]
161
+
162
+ def geometric_progression(a, r, n):
163
+ return a * (1.0 - r ** n) / (1.0 - r)
164
+
165
+ left, right = 1.01, 1.5
166
+ while right - left > 1e-6:
167
+ q = (left + right) / 2.0
168
+ gp = geometric_progression(1, q, src_size // 2)
169
+ if gp > dst_size // 2:
170
+ right = q
171
+ else:
172
+ left = q
173
+
174
+ # if q > 1.090307:
175
+ # q = 1.090307
176
+
177
+ dis = []
178
+ cur = 1
179
+ for i in range(src_size // 2):
180
+ dis.append(cur)
181
+ cur += q ** (i + 1)
182
+
183
+ r_ids = [-_ for _ in reversed(dis)]
184
+
185
+ x = r_ids + [0] + dis
186
+ y = r_ids + [0] + dis
187
+
188
+ t = dst_size // 2.0
189
+ dx = np.arange(-t, t + 0.1, 1.0)
190
+ dy = np.arange(-t, t + 0.1, 1.0)
191
+
192
+ print("Original positions = %s" % str(x))
193
+ print("Target positions = %s" % str(dx))
194
+
195
+ all_rel_pos_bias = []
196
+
197
+ for i in range(num_attn_heads):
198
+ z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy()
199
+ # torch.nn.functional has no interp2d; upstream EVA-CLIP uses SciPy's cubic 2D interpolation here
+ from scipy import interpolate as scipy_interpolate  # interp2d exists only in SciPy < 1.14
+ f = scipy_interpolate.interp2d(x, y, z, kind='cubic')
200
+ all_rel_pos_bias.append(
201
+ torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device))
202
+
203
+ rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1)
204
+
205
+ new_rel_pos_bias = torch.cat((rel_pos_bias, extra_tokens), dim=0)
206
+ state_dict[key] = new_rel_pos_bias
207
+
208
+ # interpolate position embedding
209
+ if 'pos_embed' in state_dict:
210
+ pos_embed_checkpoint = state_dict['pos_embed']
211
+ embedding_size = pos_embed_checkpoint.shape[-1]
212
+ num_patches = model.visual.patch_embed.num_patches
213
+ num_extra_tokens = model.visual.pos_embed.shape[-2] - num_patches
214
+ # height (== width) for the checkpoint position embedding
215
+ orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
216
+ # height (== width) for the new position embedding
217
+ new_size = int(num_patches ** 0.5)
218
+ # class_token and dist_token are kept unchanged
219
+ if orig_size != new_size:
220
+ print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
221
+ extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
222
+ # only the position tokens are interpolated
223
+ pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
224
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
225
+ pos_tokens = torch.nn.functional.interpolate(
226
+ pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
227
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
228
+ new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
229
+ state_dict['pos_embed'] = new_pos_embed
230
+
231
+ patch_embed_proj = state_dict['patch_embed.proj.weight']
232
+ patch_size = model.visual.patch_embed.patch_size
233
+ state_dict['patch_embed.proj.weight'] = torch.nn.functional.interpolate(
234
+ patch_embed_proj.float(), size=patch_size, mode='bicubic', align_corners=False)
235
+
236
+
237
+ def freeze_batch_norm_2d(module, module_match={}, name=''):
238
+ """
239
+ Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is
240
+ itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and
241
+ returned. Otherwise, the module is walked recursively and submodules are converted in place.
242
+
243
+ Args:
244
+ module (torch.nn.Module): Any PyTorch module.
245
+ module_match (dict): Dictionary of full module names to freeze (all if empty)
246
+ name (str): Full module name (prefix)
247
+
248
+ Returns:
249
+ torch.nn.Module: Resulting module
250
+
251
+ Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762
252
+ """
253
+ res = module
254
+ is_match = True
255
+ if module_match:
256
+ is_match = name in module_match
257
+ if is_match and isinstance(module, (nn.modules.batchnorm.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)):
258
+ res = FrozenBatchNorm2d(module.num_features)
259
+ res.num_features = module.num_features
260
+ res.affine = module.affine
261
+ if module.affine:
262
+ res.weight.data = module.weight.data.clone().detach()
263
+ res.bias.data = module.bias.data.clone().detach()
264
+ res.running_mean.data = module.running_mean.data
265
+ res.running_var.data = module.running_var.data
266
+ res.eps = module.eps
267
+ else:
268
+ for child_name, child in module.named_children():
269
+ full_child_name = '.'.join([name, child_name]) if name else child_name
270
+ new_child = freeze_batch_norm_2d(child, module_match, full_child_name)
271
+ if new_child is not child:
272
+ res.add_module(child_name, new_child)
273
+ return res
274
+
275
+
276
+ # From PyTorch internals
277
+ def _ntuple(n):
278
+ def parse(x):
279
+ if isinstance(x, collections.abc.Iterable):
280
+ return x
281
+ return tuple(repeat(x, n))
282
+ return parse
283
+
284
+
285
+ to_1tuple = _ntuple(1)
286
+ to_2tuple = _ntuple(2)
287
+ to_3tuple = _ntuple(3)
288
+ to_4tuple = _ntuple(4)
289
+ to_ntuple = lambda n, x: _ntuple(n)(x)
290
+
291
+
292
+ def is_logging(args):
293
+ def is_global_master(args):
294
+ return args.rank == 0
295
+
296
+ def is_local_master(args):
297
+ return args.local_rank == 0
298
+
299
+ def is_master(args, local=False):
300
+ return is_local_master(args) if local else is_global_master(args)
301
+ return is_master
302
+
303
+
304
+ class AllGather(torch.autograd.Function):
305
+ """An autograd function that performs allgather on a tensor.
306
+ Performs all_gather operation on the provided tensors.
307
+ *** Warning ***: torch.distributed.all_gather has no gradient.
308
+ """
309
+
310
+ @staticmethod
311
+ def forward(ctx, tensor, rank, world_size):
312
+ tensors_gather = [torch.empty_like(tensor) for _ in range(world_size)]
313
+ torch.distributed.all_gather(tensors_gather, tensor)
314
+ ctx.rank = rank
315
+ ctx.batch_size = tensor.shape[0]
316
+ return torch.cat(tensors_gather, 0)
317
+
318
+ @staticmethod
319
+ def backward(ctx, grad_output):
320
+ return (
321
+ grad_output[ctx.batch_size * ctx.rank: ctx.batch_size * (ctx.rank + 1)],
322
+ None,
323
+ None
324
+ )
325
+
326
+ allgather = AllGather.apply
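
`utils.py` also carries a few small helpers (`to_2tuple` and friends, `freeze_batch_norm_2d`) that the vision tower relies on. The sketch below shows how they behave; it assumes the repository root is on `sys.path` and that the packages listed in `PuLID_ComfyUI/requirements.txt` are installed so the `eva_clip` package imports cleanly (both are assumptions about your local checkout, not something this commit configures).

```python
import torch.nn as nn

# Assumption: repo root on sys.path; adjust the import path to your checkout.
from PuLID_ComfyUI.eva_clip.utils import to_2tuple, freeze_batch_norm_2d

print(to_2tuple(224))          # (224, 224) - scalars are broadcast to a pair
print(to_2tuple((336, 224)))   # (336, 224) - iterables pass through unchanged

# freeze_batch_norm_2d walks the module tree and swaps BatchNorm2d for FrozenBatchNorm2d.
backbone = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
frozen = freeze_batch_norm_2d(backbone)
print(type(frozen[1]).__name__)  # FrozenBatchNorm2d
```
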
PuLID_ComfyUI/examples/PuLID_4-Step_lightning.json ADDED
@@ -0,0 +1,631 @@
1
+ {
2
+ "last_node_id": 43,
3
+ "last_link_id": 128,
4
+ "nodes": [
5
+ {
6
+ "id": 8,
7
+ "type": "VAEDecode",
8
+ "pos": [
9
+ 1210,
10
+ -270
11
+ ],
12
+ "size": {
13
+ "0": 140,
14
+ "1": 46
15
+ },
16
+ "flags": {},
17
+ "order": 11,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "samples",
22
+ "type": "LATENT",
23
+ "link": 7
24
+ },
25
+ {
26
+ "name": "vae",
27
+ "type": "VAE",
28
+ "link": 8
29
+ }
30
+ ],
31
+ "outputs": [
32
+ {
33
+ "name": "IMAGE",
34
+ "type": "IMAGE",
35
+ "links": [
36
+ 10
37
+ ],
38
+ "slot_index": 0
39
+ }
40
+ ],
41
+ "properties": {
42
+ "Node name for S&R": "VAEDecode"
43
+ }
44
+ },
45
+ {
46
+ "id": 19,
47
+ "type": "PulidEvaClipLoader",
48
+ "pos": [
49
+ 130,
50
+ 120
51
+ ],
52
+ "size": {
53
+ "0": 140,
54
+ "1": 26
55
+ },
56
+ "flags": {},
57
+ "order": 0,
58
+ "mode": 0,
59
+ "outputs": [
60
+ {
61
+ "name": "EVA_CLIP",
62
+ "type": "EVA_CLIP",
63
+ "links": [
64
+ 81
65
+ ],
66
+ "shape": 3,
67
+ "slot_index": 0
68
+ }
69
+ ],
70
+ "properties": {
71
+ "Node name for S&R": "PulidEvaClipLoader"
72
+ }
73
+ },
74
+ {
75
+ "id": 17,
76
+ "type": "PulidInsightFaceLoader",
77
+ "pos": [
78
+ 60,
79
+ 190
80
+ ],
81
+ "size": {
82
+ "0": 210,
83
+ "1": 58
84
+ },
85
+ "flags": {},
86
+ "order": 1,
87
+ "mode": 0,
88
+ "outputs": [
89
+ {
90
+ "name": "FACEANALYSIS",
91
+ "type": "FACEANALYSIS",
92
+ "links": [
93
+ 82
94
+ ],
95
+ "shape": 3,
96
+ "slot_index": 0
97
+ }
98
+ ],
99
+ "properties": {
100
+ "Node name for S&R": "PulidInsightFaceLoader"
101
+ },
102
+ "widgets_values": [
103
+ "CPU"
104
+ ]
105
+ },
106
+ {
107
+ "id": 23,
108
+ "type": "CLIPTextEncode",
109
+ "pos": [
110
+ 330,
111
+ -260
112
+ ],
113
+ "size": {
114
+ "0": 334.8077697753906,
115
+ "1": 189.35675048828125
116
+ },
117
+ "flags": {},
118
+ "order": 8,
119
+ "mode": 0,
120
+ "inputs": [
121
+ {
122
+ "name": "clip",
123
+ "type": "CLIP",
124
+ "link": 94
125
+ }
126
+ ],
127
+ "outputs": [
128
+ {
129
+ "name": "CONDITIONING",
130
+ "type": "CONDITIONING",
131
+ "links": [
132
+ 34
133
+ ],
134
+ "shape": 3,
135
+ "slot_index": 0
136
+ }
137
+ ],
138
+ "properties": {
139
+ "Node name for S&R": "CLIPTextEncode"
140
+ },
141
+ "widgets_values": [
142
+ "flaws in the eyes, flaws in the face, flaws, lowres, non-HDRi, low quality, worst quality,artifacts noise, text, watermark, glitch, deformed, mutated, ugly, disfigured, hands, low resolution, partially rendered objects, deformed or partially rendered eyes, deformed, deformed eyeballs, cross-eyed,blurry"
143
+ ]
144
+ },
145
+ {
146
+ "id": 22,
147
+ "type": "CLIPTextEncode",
148
+ "pos": [
149
+ 340,
150
+ -430
151
+ ],
152
+ "size": {
153
+ "0": 315.23089599609375,
154
+ "1": 113.96450805664062
155
+ },
156
+ "flags": {},
157
+ "order": 7,
158
+ "mode": 0,
159
+ "inputs": [
160
+ {
161
+ "name": "clip",
162
+ "type": "CLIP",
163
+ "link": 93
164
+ }
165
+ ],
166
+ "outputs": [
167
+ {
168
+ "name": "CONDITIONING",
169
+ "type": "CONDITIONING",
170
+ "links": [
171
+ 35
172
+ ],
173
+ "shape": 3,
174
+ "slot_index": 0
175
+ }
176
+ ],
177
+ "properties": {
178
+ "Node name for S&R": "CLIPTextEncode"
179
+ },
180
+ "widgets_values": [
181
+ "portrait,cinematic,wolf ears,white hair"
182
+ ]
183
+ },
184
+ {
185
+ "id": 10,
186
+ "type": "PreviewImage",
187
+ "pos": [
188
+ 1230,
189
+ -160
190
+ ],
191
+ "size": [
192
+ 855.3022058439137,
193
+ 1107.2183523542942
194
+ ],
195
+ "flags": {},
196
+ "order": 12,
197
+ "mode": 0,
198
+ "inputs": [
199
+ {
200
+ "name": "images",
201
+ "type": "IMAGE",
202
+ "link": 10
203
+ }
204
+ ],
205
+ "properties": {
206
+ "Node name for S&R": "PreviewImage"
207
+ }
208
+ },
209
+ {
210
+ "id": 12,
211
+ "type": "LoadImage",
212
+ "pos": [
213
+ -117,
214
+ 336
215
+ ],
216
+ "size": {
217
+ "0": 404.07366943359375,
218
+ "1": 496.2817077636719
219
+ },
220
+ "flags": {},
221
+ "order": 2,
222
+ "mode": 0,
223
+ "outputs": [
224
+ {
225
+ "name": "IMAGE",
226
+ "type": "IMAGE",
227
+ "links": [
228
+ 114
229
+ ],
230
+ "shape": 3,
231
+ "slot_index": 0
232
+ },
233
+ {
234
+ "name": "MASK",
235
+ "type": "MASK",
236
+ "links": null,
237
+ "shape": 3
238
+ }
239
+ ],
240
+ "properties": {
241
+ "Node name for S&R": "LoadImage"
242
+ },
243
+ "widgets_values": [
244
+ "monalisa.png",
245
+ "image"
246
+ ]
247
+ },
248
+ {
249
+ "id": 5,
250
+ "type": "EmptyLatentImage",
251
+ "pos": [
252
+ 353,
253
+ 286
254
+ ],
255
+ "size": {
256
+ "0": 315,
257
+ "1": 106
258
+ },
259
+ "flags": {},
260
+ "order": 3,
261
+ "mode": 0,
262
+ "outputs": [
263
+ {
264
+ "name": "LATENT",
265
+ "type": "LATENT",
266
+ "links": [
267
+ 2
268
+ ],
269
+ "slot_index": 0
270
+ }
271
+ ],
272
+ "properties": {
273
+ "Node name for S&R": "EmptyLatentImage"
274
+ },
275
+ "widgets_values": [
276
+ 768,
277
+ 1024,
278
+ 1
279
+ ]
280
+ },
281
+ {
282
+ "id": 16,
283
+ "type": "PulidModelLoader",
284
+ "pos": [
285
+ -20,
286
+ 20
287
+ ],
288
+ "size": {
289
+ "0": 304.0072021484375,
290
+ "1": 58
291
+ },
292
+ "flags": {},
293
+ "order": 4,
294
+ "mode": 0,
295
+ "outputs": [
296
+ {
297
+ "name": "PULID",
298
+ "type": "PULID",
299
+ "links": [
300
+ 117
301
+ ],
302
+ "shape": 3,
303
+ "slot_index": 0
304
+ }
305
+ ],
306
+ "properties": {
307
+ "Node name for S&R": "PulidModelLoader"
308
+ },
309
+ "widgets_values": [
310
+ "ip-adapter_pulid_sdxl_fp16.safetensors"
311
+ ]
312
+ },
313
+ {
314
+ "id": 4,
315
+ "type": "CheckpointLoaderSimple",
316
+ "pos": [
317
+ -130,
318
+ -350
319
+ ],
320
+ "size": {
321
+ "0": 319.03692626953125,
322
+ "1": 101.3391342163086
323
+ },
324
+ "flags": {},
325
+ "order": 5,
326
+ "mode": 0,
327
+ "outputs": [
328
+ {
329
+ "name": "MODEL",
330
+ "type": "MODEL",
331
+ "links": [],
332
+ "slot_index": 0
333
+ },
334
+ {
335
+ "name": "CLIP",
336
+ "type": "CLIP",
337
+ "links": [
338
+ 93,
339
+ 94
340
+ ],
341
+ "slot_index": 1
342
+ },
343
+ {
344
+ "name": "VAE",
345
+ "type": "VAE",
346
+ "links": [
347
+ 8
348
+ ],
349
+ "slot_index": 2
350
+ }
351
+ ],
352
+ "properties": {
353
+ "Node name for S&R": "CheckpointLoaderSimple"
354
+ },
355
+ "widgets_values": [
356
+ "sdxl/sd_xl_base_1.0_0.9vae.safetensors"
357
+ ]
358
+ },
359
+ {
360
+ "id": 41,
361
+ "type": "UNETLoader",
362
+ "pos": [
363
+ -130,
364
+ -193
365
+ ],
366
+ "size": {
367
+ "0": 315,
368
+ "1": 58
369
+ },
370
+ "flags": {},
371
+ "order": 6,
372
+ "mode": 0,
373
+ "outputs": [
374
+ {
375
+ "name": "MODEL",
376
+ "type": "MODEL",
377
+ "links": [
378
+ 128
379
+ ],
380
+ "shape": 3,
381
+ "slot_index": 0
382
+ }
383
+ ],
384
+ "properties": {
385
+ "Node name for S&R": "UNETLoader"
386
+ },
387
+ "widgets_values": [
388
+ "sdxl_lightning_4step_unet.safetensors"
389
+ ]
390
+ },
391
+ {
392
+ "id": 33,
393
+ "type": "ApplyPulid",
394
+ "pos": [
395
+ 350,
396
+ -10
397
+ ],
398
+ "size": {
399
+ "0": 315,
400
+ "1": 210
401
+ },
402
+ "flags": {},
403
+ "order": 9,
404
+ "mode": 0,
405
+ "inputs": [
406
+ {
407
+ "name": "model",
408
+ "type": "MODEL",
409
+ "link": 128
410
+ },
411
+ {
412
+ "name": "pulid",
413
+ "type": "PULID",
414
+ "link": 117
415
+ },
416
+ {
417
+ "name": "eva_clip",
418
+ "type": "EVA_CLIP",
419
+ "link": 81
420
+ },
421
+ {
422
+ "name": "face_analysis",
423
+ "type": "FACEANALYSIS",
424
+ "link": 82
425
+ },
426
+ {
427
+ "name": "image",
428
+ "type": "IMAGE",
429
+ "link": 114
430
+ }
431
+ ],
432
+ "outputs": [
433
+ {
434
+ "name": "MODEL",
435
+ "type": "MODEL",
436
+ "links": [
437
+ 120
438
+ ],
439
+ "shape": 3,
440
+ "slot_index": 0
441
+ }
442
+ ],
443
+ "properties": {
444
+ "Node name for S&R": "ApplyPulid"
445
+ },
446
+ "widgets_values": [
447
+ "fidelity",
448
+ 0.8,
449
+ 0,
450
+ 1
451
+ ]
452
+ },
453
+ {
454
+ "id": 3,
455
+ "type": "KSampler",
456
+ "pos": [
457
+ 800,
458
+ -270
459
+ ],
460
+ "size": {
461
+ "0": 341.2750244140625,
462
+ "1": 262
463
+ },
464
+ "flags": {},
465
+ "order": 10,
466
+ "mode": 0,
467
+ "inputs": [
468
+ {
469
+ "name": "model",
470
+ "type": "MODEL",
471
+ "link": 120
472
+ },
473
+ {
474
+ "name": "positive",
475
+ "type": "CONDITIONING",
476
+ "link": 35
477
+ },
478
+ {
479
+ "name": "negative",
480
+ "type": "CONDITIONING",
481
+ "link": 34
482
+ },
483
+ {
484
+ "name": "latent_image",
485
+ "type": "LATENT",
486
+ "link": 2
487
+ }
488
+ ],
489
+ "outputs": [
490
+ {
491
+ "name": "LATENT",
492
+ "type": "LATENT",
493
+ "links": [
494
+ 7
495
+ ],
496
+ "slot_index": 0
497
+ }
498
+ ],
499
+ "properties": {
500
+ "Node name for S&R": "KSampler"
501
+ },
502
+ "widgets_values": [
503
+ 42,
504
+ "fixed",
505
+ 4,
506
+ 1.2,
507
+ "dpmpp_2m",
508
+ "sgm_uniform",
509
+ 1
510
+ ]
511
+ }
512
+ ],
513
+ "links": [
514
+ [
515
+ 2,
516
+ 5,
517
+ 0,
518
+ 3,
519
+ 3,
520
+ "LATENT"
521
+ ],
522
+ [
523
+ 7,
524
+ 3,
525
+ 0,
526
+ 8,
527
+ 0,
528
+ "LATENT"
529
+ ],
530
+ [
531
+ 8,
532
+ 4,
533
+ 2,
534
+ 8,
535
+ 1,
536
+ "VAE"
537
+ ],
538
+ [
539
+ 10,
540
+ 8,
541
+ 0,
542
+ 10,
543
+ 0,
544
+ "IMAGE"
545
+ ],
546
+ [
547
+ 34,
548
+ 23,
549
+ 0,
550
+ 3,
551
+ 2,
552
+ "CONDITIONING"
553
+ ],
554
+ [
555
+ 35,
556
+ 22,
557
+ 0,
558
+ 3,
559
+ 1,
560
+ "CONDITIONING"
561
+ ],
562
+ [
563
+ 81,
564
+ 19,
565
+ 0,
566
+ 33,
567
+ 2,
568
+ "EVA_CLIP"
569
+ ],
570
+ [
571
+ 82,
572
+ 17,
573
+ 0,
574
+ 33,
575
+ 3,
576
+ "FACEANALYSIS"
577
+ ],
578
+ [
579
+ 93,
580
+ 4,
581
+ 1,
582
+ 22,
583
+ 0,
584
+ "CLIP"
585
+ ],
586
+ [
587
+ 94,
588
+ 4,
589
+ 1,
590
+ 23,
591
+ 0,
592
+ "CLIP"
593
+ ],
594
+ [
595
+ 114,
596
+ 12,
597
+ 0,
598
+ 33,
599
+ 4,
600
+ "IMAGE"
601
+ ],
602
+ [
603
+ 117,
604
+ 16,
605
+ 0,
606
+ 33,
607
+ 1,
608
+ "PULID"
609
+ ],
610
+ [
611
+ 120,
612
+ 33,
613
+ 0,
614
+ 3,
615
+ 0,
616
+ "MODEL"
617
+ ],
618
+ [
619
+ 128,
620
+ 41,
621
+ 0,
622
+ 33,
623
+ 0,
624
+ "MODEL"
625
+ ]
626
+ ],
627
+ "groups": [],
628
+ "config": {},
629
+ "extra": {},
630
+ "version": 0.4
631
+ }
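
The example workflows are stored in ComfyUI's saved-graph format (top-level `nodes`, `links`, and widget values). A quick way to sanity-check one without opening the UI is to load it with the standard `json` module; the path below is the file added in this commit, relative to a local checkout (adjust as needed for your setup).

```python
import json

# Hypothetical local path to the workflow added in this commit.
with open("PuLID_ComfyUI/examples/PuLID_4-Step_lightning.json") as f:
    wf = json.load(f)

print(wf["last_node_id"], wf["last_link_id"])          # 43 128
print(sorted({node["type"] for node in wf["nodes"]}))   # node classes the workflow relies on
print(len(wf["links"]))                                 # number of wires between node slots
```
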
PuLID_ComfyUI/examples/PuLID_IPAdapter_style_transfer.json ADDED
@@ -0,0 +1,794 @@
1
+ {
2
+ "last_node_id": 48,
3
+ "last_link_id": 139,
4
+ "nodes": [
5
+ {
6
+ "id": 19,
7
+ "type": "PulidEvaClipLoader",
8
+ "pos": [
9
+ 130,
10
+ 120
11
+ ],
12
+ "size": {
13
+ "0": 140,
14
+ "1": 26
15
+ },
16
+ "flags": {},
17
+ "order": 0,
18
+ "mode": 0,
19
+ "outputs": [
20
+ {
21
+ "name": "EVA_CLIP",
22
+ "type": "EVA_CLIP",
23
+ "links": [
24
+ 81
25
+ ],
26
+ "shape": 3,
27
+ "slot_index": 0
28
+ }
29
+ ],
30
+ "properties": {
31
+ "Node name for S&R": "PulidEvaClipLoader"
32
+ }
33
+ },
34
+ {
35
+ "id": 17,
36
+ "type": "PulidInsightFaceLoader",
37
+ "pos": [
38
+ 60,
39
+ 190
40
+ ],
41
+ "size": {
42
+ "0": 210,
43
+ "1": 58
44
+ },
45
+ "flags": {},
46
+ "order": 1,
47
+ "mode": 0,
48
+ "outputs": [
49
+ {
50
+ "name": "FACEANALYSIS",
51
+ "type": "FACEANALYSIS",
52
+ "links": [
53
+ 82
54
+ ],
55
+ "shape": 3,
56
+ "slot_index": 0
57
+ }
58
+ ],
59
+ "properties": {
60
+ "Node name for S&R": "PulidInsightFaceLoader"
61
+ },
62
+ "widgets_values": [
63
+ "CPU"
64
+ ]
65
+ },
66
+ {
67
+ "id": 16,
68
+ "type": "PulidModelLoader",
69
+ "pos": [
70
+ -20,
71
+ 20
72
+ ],
73
+ "size": {
74
+ "0": 304.0072021484375,
75
+ "1": 58
76
+ },
77
+ "flags": {},
78
+ "order": 2,
79
+ "mode": 0,
80
+ "outputs": [
81
+ {
82
+ "name": "PULID",
83
+ "type": "PULID",
84
+ "links": [
85
+ 117
86
+ ],
87
+ "shape": 3,
88
+ "slot_index": 0
89
+ }
90
+ ],
91
+ "properties": {
92
+ "Node name for S&R": "PulidModelLoader"
93
+ },
94
+ "widgets_values": [
95
+ "ip-adapter_pulid_sdxl_fp16.safetensors"
96
+ ]
97
+ },
98
+ {
99
+ "id": 5,
100
+ "type": "EmptyLatentImage",
101
+ "pos": [
102
+ 350,
103
+ 265
104
+ ],
105
+ "size": {
106
+ "0": 315,
107
+ "1": 106
108
+ },
109
+ "flags": {},
110
+ "order": 3,
111
+ "mode": 0,
112
+ "outputs": [
113
+ {
114
+ "name": "LATENT",
115
+ "type": "LATENT",
116
+ "links": [
117
+ 2
118
+ ],
119
+ "slot_index": 0
120
+ }
121
+ ],
122
+ "properties": {
123
+ "Node name for S&R": "EmptyLatentImage"
124
+ },
125
+ "widgets_values": [
126
+ 768,
127
+ 1024,
128
+ 1
129
+ ]
130
+ },
131
+ {
132
+ "id": 23,
133
+ "type": "CLIPTextEncode",
134
+ "pos": [
135
+ 330,
136
+ -260
137
+ ],
138
+ "size": {
139
+ "0": 334.8077697753906,
140
+ "1": 189.35675048828125
141
+ },
142
+ "flags": {},
143
+ "order": 9,
144
+ "mode": 0,
145
+ "inputs": [
146
+ {
147
+ "name": "clip",
148
+ "type": "CLIP",
149
+ "link": 94
150
+ }
151
+ ],
152
+ "outputs": [
153
+ {
154
+ "name": "CONDITIONING",
155
+ "type": "CONDITIONING",
156
+ "links": [
157
+ 34
158
+ ],
159
+ "shape": 3,
160
+ "slot_index": 0
161
+ }
162
+ ],
163
+ "properties": {
164
+ "Node name for S&R": "CLIPTextEncode"
165
+ },
166
+ "widgets_values": [
167
+ "blurry, malformed, low quality, worst quality, artifacts, noise, text, watermark, glitch, deformed, ugly, horror, ill"
168
+ ]
169
+ },
170
+ {
171
+ "id": 22,
172
+ "type": "CLIPTextEncode",
173
+ "pos": [
174
+ 340,
175
+ -430
176
+ ],
177
+ "size": {
178
+ "0": 315.23089599609375,
179
+ "1": 113.96450805664062
180
+ },
181
+ "flags": {},
182
+ "order": 8,
183
+ "mode": 0,
184
+ "inputs": [
185
+ {
186
+ "name": "clip",
187
+ "type": "CLIP",
188
+ "link": 93
189
+ }
190
+ ],
191
+ "outputs": [
192
+ {
193
+ "name": "CONDITIONING",
194
+ "type": "CONDITIONING",
195
+ "links": [
196
+ 35
197
+ ],
198
+ "shape": 3,
199
+ "slot_index": 0
200
+ }
201
+ ],
202
+ "properties": {
203
+ "Node name for S&R": "CLIPTextEncode"
204
+ },
205
+ "widgets_values": [
206
+ "closeup portrait, cyberpunk, cinematic, hoodie, purple hair, highly detailed, 4k, high resolution"
207
+ ]
208
+ },
209
+ {
210
+ "id": 12,
211
+ "type": "LoadImage",
212
+ "pos": [
213
+ -115,
214
+ 310
215
+ ],
216
+ "size": {
217
+ "0": 404.07366943359375,
218
+ "1": 496.2817077636719
219
+ },
220
+ "flags": {},
221
+ "order": 4,
222
+ "mode": 0,
223
+ "outputs": [
224
+ {
225
+ "name": "IMAGE",
226
+ "type": "IMAGE",
227
+ "links": [
228
+ 114
229
+ ],
230
+ "shape": 3,
231
+ "slot_index": 0
232
+ },
233
+ {
234
+ "name": "MASK",
235
+ "type": "MASK",
236
+ "links": null,
237
+ "shape": 3
238
+ }
239
+ ],
240
+ "properties": {
241
+ "Node name for S&R": "LoadImage"
242
+ },
243
+ "widgets_values": [
244
+ "monalisa.png",
245
+ "image"
246
+ ]
247
+ },
248
+ {
249
+ "id": 4,
250
+ "type": "CheckpointLoaderSimple",
251
+ "pos": [
252
+ -97,
253
+ -265
254
+ ],
255
+ "size": {
256
+ "0": 319.03692626953125,
257
+ "1": 101.3391342163086
258
+ },
259
+ "flags": {},
260
+ "order": 5,
261
+ "mode": 0,
262
+ "outputs": [
263
+ {
264
+ "name": "MODEL",
265
+ "type": "MODEL",
266
+ "links": [
267
+ 133
268
+ ],
269
+ "slot_index": 0
270
+ },
271
+ {
272
+ "name": "CLIP",
273
+ "type": "CLIP",
274
+ "links": [
275
+ 93,
276
+ 94
277
+ ],
278
+ "slot_index": 1
279
+ },
280
+ {
281
+ "name": "VAE",
282
+ "type": "VAE",
283
+ "links": [
284
+ 8
285
+ ],
286
+ "slot_index": 2
287
+ }
288
+ ],
289
+ "properties": {
290
+ "Node name for S&R": "CheckpointLoaderSimple"
291
+ },
292
+ "widgets_values": [
293
+ "sdxl/Proteus-RunDiffusion.safetensors"
294
+ ]
295
+ },
296
+ {
297
+ "id": 33,
298
+ "type": "ApplyPulid",
299
+ "pos": [
300
+ 350,
301
+ -10
302
+ ],
303
+ "size": {
304
+ "0": 315,
305
+ "1": 210
306
+ },
307
+ "flags": {},
308
+ "order": 7,
309
+ "mode": 0,
310
+ "inputs": [
311
+ {
312
+ "name": "model",
313
+ "type": "MODEL",
314
+ "link": 133
315
+ },
316
+ {
317
+ "name": "pulid",
318
+ "type": "PULID",
319
+ "link": 117
320
+ },
321
+ {
322
+ "name": "eva_clip",
323
+ "type": "EVA_CLIP",
324
+ "link": 81
325
+ },
326
+ {
327
+ "name": "face_analysis",
328
+ "type": "FACEANALYSIS",
329
+ "link": 82
330
+ },
331
+ {
332
+ "name": "image",
333
+ "type": "IMAGE",
334
+ "link": 114
335
+ }
336
+ ],
337
+ "outputs": [
338
+ {
339
+ "name": "MODEL",
340
+ "type": "MODEL",
341
+ "links": [
342
+ 136
343
+ ],
344
+ "shape": 3,
345
+ "slot_index": 0
346
+ }
347
+ ],
348
+ "properties": {
349
+ "Node name for S&R": "ApplyPulid"
350
+ },
351
+ "widgets_values": [
352
+ "fidelity",
353
+ 0.8,
354
+ 0,
355
+ 1
356
+ ]
357
+ },
358
+ {
359
+ "id": 47,
360
+ "type": "IPAdapterUnifiedLoader",
361
+ "pos": [
362
+ 720,
363
+ -10
364
+ ],
365
+ "size": [
366
+ 245.09423828124943,
367
+ 78
368
+ ],
369
+ "flags": {},
370
+ "order": 10,
371
+ "mode": 0,
372
+ "inputs": [
373
+ {
374
+ "name": "model",
375
+ "type": "MODEL",
376
+ "link": 136
377
+ },
378
+ {
379
+ "name": "ipadapter",
380
+ "type": "IPADAPTER",
381
+ "link": null
382
+ }
383
+ ],
384
+ "outputs": [
385
+ {
386
+ "name": "model",
387
+ "type": "MODEL",
388
+ "links": [
389
+ 137
390
+ ],
391
+ "shape": 3,
392
+ "slot_index": 0
393
+ },
394
+ {
395
+ "name": "ipadapter",
396
+ "type": "IPADAPTER",
397
+ "links": [
398
+ 135
399
+ ],
400
+ "shape": 3
401
+ }
402
+ ],
403
+ "properties": {
404
+ "Node name for S&R": "IPAdapterUnifiedLoader"
405
+ },
406
+ "widgets_values": [
407
+ "PLUS (high strength)"
408
+ ]
409
+ },
410
+ {
411
+ "id": 8,
412
+ "type": "VAEDecode",
413
+ "pos": [
414
+ 1831,
415
+ 16
416
+ ],
417
+ "size": {
418
+ "0": 140,
419
+ "1": 46
420
+ },
421
+ "flags": {},
422
+ "order": 13,
423
+ "mode": 0,
424
+ "inputs": [
425
+ {
426
+ "name": "samples",
427
+ "type": "LATENT",
428
+ "link": 7
429
+ },
430
+ {
431
+ "name": "vae",
432
+ "type": "VAE",
433
+ "link": 8
434
+ }
435
+ ],
436
+ "outputs": [
437
+ {
438
+ "name": "IMAGE",
439
+ "type": "IMAGE",
440
+ "links": [
441
+ 10
442
+ ],
443
+ "slot_index": 0
444
+ }
445
+ ],
446
+ "properties": {
447
+ "Node name for S&R": "VAEDecode"
448
+ }
449
+ },
450
+ {
451
+ "id": 10,
452
+ "type": "PreviewImage",
453
+ "pos": [
454
+ 1817,
455
+ 123
456
+ ],
457
+ "size": [
458
+ 705.6038401281248,
459
+ 950.4616015812499
460
+ ],
461
+ "flags": {},
462
+ "order": 14,
463
+ "mode": 0,
464
+ "inputs": [
465
+ {
466
+ "name": "images",
467
+ "type": "IMAGE",
468
+ "link": 10
469
+ }
470
+ ],
471
+ "properties": {
472
+ "Node name for S&R": "PreviewImage"
473
+ }
474
+ },
475
+ {
476
+ "id": 46,
477
+ "type": "IPAdapterAdvanced",
478
+ "pos": [
479
+ 1033,
480
+ -36
481
+ ],
482
+ "size": {
483
+ "0": 315,
484
+ "1": 278
485
+ },
486
+ "flags": {},
487
+ "order": 11,
488
+ "mode": 0,
489
+ "inputs": [
490
+ {
491
+ "name": "model",
492
+ "type": "MODEL",
493
+ "link": 137
494
+ },
495
+ {
496
+ "name": "ipadapter",
497
+ "type": "IPADAPTER",
498
+ "link": 135,
499
+ "slot_index": 1
500
+ },
501
+ {
502
+ "name": "image",
503
+ "type": "IMAGE",
504
+ "link": 139,
505
+ "slot_index": 2
506
+ },
507
+ {
508
+ "name": "image_negative",
509
+ "type": "IMAGE",
510
+ "link": null
511
+ },
512
+ {
513
+ "name": "attn_mask",
514
+ "type": "MASK",
515
+ "link": null
516
+ },
517
+ {
518
+ "name": "clip_vision",
519
+ "type": "CLIP_VISION",
520
+ "link": null
521
+ }
522
+ ],
523
+ "outputs": [
524
+ {
525
+ "name": "MODEL",
526
+ "type": "MODEL",
527
+ "links": [
528
+ 138
529
+ ],
530
+ "shape": 3,
531
+ "slot_index": 0
532
+ }
533
+ ],
534
+ "properties": {
535
+ "Node name for S&R": "IPAdapterAdvanced"
536
+ },
537
+ "widgets_values": [
538
+ 1,
539
+ "style transfer",
540
+ "concat",
541
+ 0,
542
+ 1,
543
+ "V only"
544
+ ]
545
+ },
546
+ {
547
+ "id": 48,
548
+ "type": "LoadImage",
549
+ "pos": [
550
+ 1032,
551
+ 303
552
+ ],
553
+ "size": [
554
+ 315,
555
+ 314
556
+ ],
557
+ "flags": {},
558
+ "order": 6,
559
+ "mode": 0,
560
+ "outputs": [
561
+ {
562
+ "name": "IMAGE",
563
+ "type": "IMAGE",
564
+ "links": [
565
+ 139
566
+ ],
567
+ "shape": 3
568
+ },
569
+ {
570
+ "name": "MASK",
571
+ "type": "MASK",
572
+ "links": null,
573
+ "shape": 3
574
+ }
575
+ ],
576
+ "properties": {
577
+ "Node name for S&R": "LoadImage"
578
+ },
579
+ "widgets_values": [
580
+ "anime_illustration.png",
581
+ "image"
582
+ ]
583
+ },
584
+ {
585
+ "id": 3,
586
+ "type": "KSampler",
587
+ "pos": [
588
+ 1413,
589
+ 12
590
+ ],
591
+ "size": {
592
+ "0": 341.2750244140625,
593
+ "1": 262
594
+ },
595
+ "flags": {},
596
+ "order": 12,
597
+ "mode": 0,
598
+ "inputs": [
599
+ {
600
+ "name": "model",
601
+ "type": "MODEL",
602
+ "link": 138
603
+ },
604
+ {
605
+ "name": "positive",
606
+ "type": "CONDITIONING",
607
+ "link": 35
608
+ },
609
+ {
610
+ "name": "negative",
611
+ "type": "CONDITIONING",
612
+ "link": 34
613
+ },
614
+ {
615
+ "name": "latent_image",
616
+ "type": "LATENT",
617
+ "link": 2
618
+ }
619
+ ],
620
+ "outputs": [
621
+ {
622
+ "name": "LATENT",
623
+ "type": "LATENT",
624
+ "links": [
625
+ 7
626
+ ],
627
+ "slot_index": 0
628
+ }
629
+ ],
630
+ "properties": {
631
+ "Node name for S&R": "KSampler"
632
+ },
633
+ "widgets_values": [
634
+ 52,
635
+ "fixed",
636
+ 30,
637
+ 6,
638
+ "dpmpp_2m",
639
+ "sgm_uniform",
640
+ 1
641
+ ]
642
+ }
643
+ ],
644
+ "links": [
645
+ [
646
+ 2,
647
+ 5,
648
+ 0,
649
+ 3,
650
+ 3,
651
+ "LATENT"
652
+ ],
653
+ [
654
+ 7,
655
+ 3,
656
+ 0,
657
+ 8,
658
+ 0,
659
+ "LATENT"
660
+ ],
661
+ [
662
+ 8,
663
+ 4,
664
+ 2,
665
+ 8,
666
+ 1,
667
+ "VAE"
668
+ ],
669
+ [
670
+ 10,
671
+ 8,
672
+ 0,
673
+ 10,
674
+ 0,
675
+ "IMAGE"
676
+ ],
677
+ [
678
+ 34,
679
+ 23,
680
+ 0,
681
+ 3,
682
+ 2,
683
+ "CONDITIONING"
684
+ ],
685
+ [
686
+ 35,
687
+ 22,
688
+ 0,
689
+ 3,
690
+ 1,
691
+ "CONDITIONING"
692
+ ],
693
+ [
694
+ 81,
695
+ 19,
696
+ 0,
697
+ 33,
698
+ 2,
699
+ "EVA_CLIP"
700
+ ],
701
+ [
702
+ 82,
703
+ 17,
704
+ 0,
705
+ 33,
706
+ 3,
707
+ "FACEANALYSIS"
708
+ ],
709
+ [
710
+ 93,
711
+ 4,
712
+ 1,
713
+ 22,
714
+ 0,
715
+ "CLIP"
716
+ ],
717
+ [
718
+ 94,
719
+ 4,
720
+ 1,
721
+ 23,
722
+ 0,
723
+ "CLIP"
724
+ ],
725
+ [
726
+ 114,
727
+ 12,
728
+ 0,
729
+ 33,
730
+ 4,
731
+ "IMAGE"
732
+ ],
733
+ [
734
+ 117,
735
+ 16,
736
+ 0,
737
+ 33,
738
+ 1,
739
+ "PULID"
740
+ ],
741
+ [
742
+ 133,
743
+ 4,
744
+ 0,
745
+ 33,
746
+ 0,
747
+ "MODEL"
748
+ ],
749
+ [
750
+ 135,
751
+ 47,
752
+ 1,
753
+ 46,
754
+ 1,
755
+ "IPADAPTER"
756
+ ],
757
+ [
758
+ 136,
759
+ 33,
760
+ 0,
761
+ 47,
762
+ 0,
763
+ "MODEL"
764
+ ],
765
+ [
766
+ 137,
767
+ 47,
768
+ 0,
769
+ 46,
770
+ 0,
771
+ "MODEL"
772
+ ],
773
+ [
774
+ 138,
775
+ 46,
776
+ 0,
777
+ 3,
778
+ 0,
779
+ "MODEL"
780
+ ],
781
+ [
782
+ 139,
783
+ 48,
784
+ 0,
785
+ 46,
786
+ 2,
787
+ "IMAGE"
788
+ ]
789
+ ],
790
+ "groups": [],
791
+ "config": {},
792
+ "extra": {},
793
+ "version": 0.4
794
+ }
PuLID_ComfyUI/examples/PuLID_attention_mask.json ADDED
@@ -0,0 +1,946 @@
1
+ {
2
+ "last_node_id": 88,
3
+ "last_link_id": 248,
4
+ "nodes": [
5
+ {
6
+ "id": 5,
7
+ "type": "EmptyLatentImage",
8
+ "pos": [
9
+ 350,
10
+ 265
11
+ ],
12
+ "size": {
13
+ "0": 315,
14
+ "1": 106
15
+ },
16
+ "flags": {},
17
+ "order": 0,
18
+ "mode": 0,
19
+ "outputs": [
20
+ {
21
+ "name": "LATENT",
22
+ "type": "LATENT",
23
+ "links": [
24
+ 2
25
+ ],
26
+ "slot_index": 0
27
+ }
28
+ ],
29
+ "properties": {
30
+ "Node name for S&R": "EmptyLatentImage"
31
+ },
32
+ "widgets_values": [
33
+ 1280,
34
+ 960,
35
+ 1
36
+ ]
37
+ },
38
+ {
39
+ "id": 33,
40
+ "type": "ApplyPulid",
41
+ "pos": [
42
+ 350,
43
+ -10
44
+ ],
45
+ "size": {
46
+ "0": 315,
47
+ "1": 230
48
+ },
49
+ "flags": {},
50
+ "order": 13,
51
+ "mode": 0,
52
+ "inputs": [
53
+ {
54
+ "name": "model",
55
+ "type": "MODEL",
56
+ "link": 133
57
+ },
58
+ {
59
+ "name": "pulid",
60
+ "type": "PULID",
61
+ "link": 117
62
+ },
63
+ {
64
+ "name": "eva_clip",
65
+ "type": "EVA_CLIP",
66
+ "link": 81
67
+ },
68
+ {
69
+ "name": "face_analysis",
70
+ "type": "FACEANALYSIS",
71
+ "link": 82
72
+ },
73
+ {
74
+ "name": "image",
75
+ "type": "IMAGE",
76
+ "link": 114
77
+ },
78
+ {
79
+ "name": "attn_mask",
80
+ "type": "MASK",
81
+ "link": 247
82
+ }
83
+ ],
84
+ "outputs": [
85
+ {
86
+ "name": "MODEL",
87
+ "type": "MODEL",
88
+ "links": [
89
+ 141
90
+ ],
91
+ "shape": 3,
92
+ "slot_index": 0
93
+ }
94
+ ],
95
+ "properties": {
96
+ "Node name for S&R": "ApplyPulid"
97
+ },
98
+ "widgets_values": [
99
+ "fidelity",
100
+ 0.7000000000000001,
101
+ 0,
102
+ 1
103
+ ]
104
+ },
105
+ {
106
+ "id": 85,
107
+ "type": "SolidMask",
108
+ "pos": [
109
+ -307,
110
+ 584
111
+ ],
112
+ "size": [
113
+ 210,
114
+ 106
115
+ ],
116
+ "flags": {},
117
+ "order": 1,
118
+ "mode": 0,
119
+ "outputs": [
120
+ {
121
+ "name": "MASK",
122
+ "type": "MASK",
123
+ "links": [
124
+ 244
125
+ ],
126
+ "shape": 3,
127
+ "slot_index": 0
128
+ }
129
+ ],
130
+ "properties": {
131
+ "Node name for S&R": "SolidMask"
132
+ },
133
+ "widgets_values": [
134
+ 0,
135
+ 1280,
136
+ 960
137
+ ]
138
+ },
139
+ {
140
+ "id": 49,
141
+ "type": "LoadImage",
142
+ "pos": [
143
+ 407,
144
+ 550
145
+ ],
146
+ "size": [
147
+ 248.03589794921936,
148
+ 339.7795556640626
149
+ ],
150
+ "flags": {},
151
+ "order": 2,
152
+ "mode": 0,
153
+ "outputs": [
154
+ {
155
+ "name": "IMAGE",
156
+ "type": "IMAGE",
157
+ "links": [
158
+ 145
159
+ ],
160
+ "shape": 3,
161
+ "slot_index": 0
162
+ },
163
+ {
164
+ "name": "MASK",
165
+ "type": "MASK",
166
+ "links": null,
167
+ "shape": 3
168
+ }
169
+ ],
170
+ "properties": {
171
+ "Node name for S&R": "LoadImage"
172
+ },
173
+ "widgets_values": [
174
+ "venere.jpg",
175
+ "image"
176
+ ]
177
+ },
178
+ {
179
+ "id": 48,
180
+ "type": "InvertMask",
181
+ "pos": [
182
+ 526,
183
+ 438
184
+ ],
185
+ "size": [
186
+ 140,
187
+ 26
188
+ ],
189
+ "flags": {},
190
+ "order": 12,
191
+ "mode": 0,
192
+ "inputs": [
193
+ {
194
+ "name": "mask",
195
+ "type": "MASK",
196
+ "link": 246
197
+ }
198
+ ],
199
+ "outputs": [
200
+ {
201
+ "name": "MASK",
202
+ "type": "MASK",
203
+ "links": [
204
+ 151
205
+ ],
206
+ "shape": 3,
207
+ "slot_index": 0
208
+ }
209
+ ],
210
+ "properties": {
211
+ "Node name for S&R": "InvertMask"
212
+ }
213
+ },
214
+ {
215
+ "id": 8,
216
+ "type": "VAEDecode",
217
+ "pos": [
218
+ 1575,
219
+ 160
220
+ ],
221
+ "size": {
222
+ "0": 140,
223
+ "1": 46
224
+ },
225
+ "flags": {},
226
+ "order": 16,
227
+ "mode": 0,
228
+ "inputs": [
229
+ {
230
+ "name": "samples",
231
+ "type": "LATENT",
232
+ "link": 7
233
+ },
234
+ {
235
+ "name": "vae",
236
+ "type": "VAE",
237
+ "link": 8
238
+ }
239
+ ],
240
+ "outputs": [
241
+ {
242
+ "name": "IMAGE",
243
+ "type": "IMAGE",
244
+ "links": [
245
+ 10
246
+ ],
247
+ "slot_index": 0
248
+ }
249
+ ],
250
+ "properties": {
251
+ "Node name for S&R": "VAEDecode"
252
+ }
253
+ },
254
+ {
255
+ "id": 10,
256
+ "type": "PreviewImage",
257
+ "pos": [
258
+ 1592,
259
+ 279
260
+ ],
261
+ "size": [
262
+ 1370.7157657734379,
263
+ 1041.8039240156252
264
+ ],
265
+ "flags": {},
266
+ "order": 17,
267
+ "mode": 0,
268
+ "inputs": [
269
+ {
270
+ "name": "images",
271
+ "type": "IMAGE",
272
+ "link": 10
273
+ }
274
+ ],
275
+ "properties": {
276
+ "Node name for S&R": "PreviewImage"
277
+ }
278
+ },
279
+ {
280
+ "id": 16,
281
+ "type": "PulidModelLoader",
282
+ "pos": [
283
+ -111,
284
+ -181
285
+ ],
286
+ "size": {
287
+ "0": 304.0072021484375,
288
+ "1": 58
289
+ },
290
+ "flags": {},
291
+ "order": 3,
292
+ "mode": 0,
293
+ "outputs": [
294
+ {
295
+ "name": "PULID",
296
+ "type": "PULID",
297
+ "links": [
298
+ 117,
299
+ 136
300
+ ],
301
+ "shape": 3,
302
+ "slot_index": 0
303
+ }
304
+ ],
305
+ "properties": {
306
+ "Node name for S&R": "PulidModelLoader"
307
+ },
308
+ "widgets_values": [
309
+ "ip-adapter_pulid_sdxl_fp16.safetensors"
310
+ ]
311
+ },
312
+ {
313
+ "id": 19,
314
+ "type": "PulidEvaClipLoader",
315
+ "pos": [
316
+ 54,
317
+ -69
318
+ ],
319
+ "size": {
320
+ "0": 140,
321
+ "1": 26
322
+ },
323
+ "flags": {},
324
+ "order": 4,
325
+ "mode": 0,
326
+ "outputs": [
327
+ {
328
+ "name": "EVA_CLIP",
329
+ "type": "EVA_CLIP",
330
+ "links": [
331
+ 81,
332
+ 137
333
+ ],
334
+ "shape": 3,
335
+ "slot_index": 0
336
+ }
337
+ ],
338
+ "properties": {
339
+ "Node name for S&R": "PulidEvaClipLoader"
340
+ }
341
+ },
342
+ {
343
+ "id": 17,
344
+ "type": "PulidInsightFaceLoader",
345
+ "pos": [
346
+ -18,
347
+ 12
348
+ ],
349
+ "size": {
350
+ "0": 210,
351
+ "1": 58
352
+ },
353
+ "flags": {},
354
+ "order": 5,
355
+ "mode": 0,
356
+ "outputs": [
357
+ {
358
+ "name": "FACEANALYSIS",
359
+ "type": "FACEANALYSIS",
360
+ "links": [
361
+ 82,
362
+ 138
363
+ ],
364
+ "shape": 3,
365
+ "slot_index": 0
366
+ }
367
+ ],
368
+ "properties": {
369
+ "Node name for S&R": "PulidInsightFaceLoader"
370
+ },
371
+ "widgets_values": [
372
+ "CPU"
373
+ ]
374
+ },
375
+ {
376
+ "id": 12,
377
+ "type": "LoadImage",
378
+ "pos": [
379
+ -34,
380
+ 145
381
+ ],
382
+ "size": [
383
+ 261.645185990767,
384
+ 346.38255171342325
385
+ ],
386
+ "flags": {},
387
+ "order": 6,
388
+ "mode": 0,
389
+ "outputs": [
390
+ {
391
+ "name": "IMAGE",
392
+ "type": "IMAGE",
393
+ "links": [
394
+ 114
395
+ ],
396
+ "shape": 3,
397
+ "slot_index": 0
398
+ },
399
+ {
400
+ "name": "MASK",
401
+ "type": "MASK",
402
+ "links": null,
403
+ "shape": 3
404
+ }
405
+ ],
406
+ "properties": {
407
+ "Node name for S&R": "LoadImage"
408
+ },
409
+ "widgets_values": [
410
+ "monalisa.png",
411
+ "image"
412
+ ]
413
+ },
414
+ {
415
+ "id": 87,
416
+ "type": "MaskComposite",
417
+ "pos": [
418
+ 15,
419
+ 546
420
+ ],
421
+ "size": [
422
+ 210,
423
+ 126
424
+ ],
425
+ "flags": {},
426
+ "order": 9,
427
+ "mode": 0,
428
+ "inputs": [
429
+ {
430
+ "name": "destination",
431
+ "type": "MASK",
432
+ "link": 244
433
+ },
434
+ {
435
+ "name": "source",
436
+ "type": "MASK",
437
+ "link": 245
438
+ }
439
+ ],
440
+ "outputs": [
441
+ {
442
+ "name": "MASK",
443
+ "type": "MASK",
444
+ "links": [
445
+ 246,
446
+ 247
447
+ ],
448
+ "shape": 3,
449
+ "slot_index": 0
450
+ }
451
+ ],
452
+ "properties": {
453
+ "Node name for S&R": "MaskComposite"
454
+ },
455
+ "widgets_values": [
456
+ 0,
457
+ 0,
458
+ "add"
459
+ ]
460
+ },
461
+ {
462
+ "id": 86,
463
+ "type": "SolidMask",
464
+ "pos": [
465
+ -304,
466
+ 747
467
+ ],
468
+ "size": {
469
+ "0": 210,
470
+ "1": 106
471
+ },
472
+ "flags": {},
473
+ "order": 7,
474
+ "mode": 0,
475
+ "outputs": [
476
+ {
477
+ "name": "MASK",
478
+ "type": "MASK",
479
+ "links": [
480
+ 245
481
+ ],
482
+ "shape": 3,
483
+ "slot_index": 0
484
+ }
485
+ ],
486
+ "properties": {
487
+ "Node name for S&R": "SolidMask"
488
+ },
489
+ "widgets_values": [
490
+ 1,
491
+ 640,
492
+ 960
493
+ ]
494
+ },
495
+ {
496
+ "id": 23,
497
+ "type": "CLIPTextEncode",
498
+ "pos": [
499
+ 756,
500
+ -47
501
+ ],
502
+ "size": [
503
+ 316.32471195096673,
504
+ 101.97065006593618
505
+ ],
506
+ "flags": {},
507
+ "order": 10,
508
+ "mode": 0,
509
+ "inputs": [
510
+ {
511
+ "name": "clip",
512
+ "type": "CLIP",
513
+ "link": 94
514
+ }
515
+ ],
516
+ "outputs": [
517
+ {
518
+ "name": "CONDITIONING",
519
+ "type": "CONDITIONING",
520
+ "links": [
521
+ 34
522
+ ],
523
+ "shape": 3,
524
+ "slot_index": 0
525
+ }
526
+ ],
527
+ "properties": {
528
+ "Node name for S&R": "CLIPTextEncode"
529
+ },
530
+ "widgets_values": [
531
+ "blurry, malformed, low quality, worst quality, artifacts, noise, text, watermark, glitch, deformed, ugly, horror, ill"
532
+ ]
533
+ },
534
+ {
535
+ "id": 47,
536
+ "type": "ApplyPulid",
537
+ "pos": [
538
+ 765,
539
+ 128
540
+ ],
541
+ "size": {
542
+ "0": 315,
543
+ "1": 230
544
+ },
545
+ "flags": {},
546
+ "order": 14,
547
+ "mode": 0,
548
+ "inputs": [
549
+ {
550
+ "name": "model",
551
+ "type": "MODEL",
552
+ "link": 141
553
+ },
554
+ {
555
+ "name": "pulid",
556
+ "type": "PULID",
557
+ "link": 136
558
+ },
559
+ {
560
+ "name": "eva_clip",
561
+ "type": "EVA_CLIP",
562
+ "link": 137
563
+ },
564
+ {
565
+ "name": "face_analysis",
566
+ "type": "FACEANALYSIS",
567
+ "link": 138
568
+ },
569
+ {
570
+ "name": "image",
571
+ "type": "IMAGE",
572
+ "link": 145
573
+ },
574
+ {
575
+ "name": "attn_mask",
576
+ "type": "MASK",
577
+ "link": 151
578
+ }
579
+ ],
580
+ "outputs": [
581
+ {
582
+ "name": "MODEL",
583
+ "type": "MODEL",
584
+ "links": [
585
+ 142
586
+ ],
587
+ "shape": 3,
588
+ "slot_index": 0
589
+ }
590
+ ],
591
+ "properties": {
592
+ "Node name for S&R": "ApplyPulid"
593
+ },
594
+ "widgets_values": [
595
+ "fidelity",
596
+ 0.7000000000000001,
597
+ 0,
598
+ 1
599
+ ]
600
+ },
601
+ {
602
+ "id": 55,
603
+ "type": "CLIPTextEncode",
604
+ "pos": [
605
+ 755,
606
+ -211
607
+ ],
608
+ "size": {
609
+ "0": 315.23089599609375,
610
+ "1": 113.96450805664062
611
+ },
612
+ "flags": {},
613
+ "order": 11,
614
+ "mode": 0,
615
+ "inputs": [
616
+ {
617
+ "name": "clip",
618
+ "type": "CLIP",
619
+ "link": 156
620
+ }
621
+ ],
622
+ "outputs": [
623
+ {
624
+ "name": "CONDITIONING",
625
+ "type": "CONDITIONING",
626
+ "links": [
627
+ 160
628
+ ],
629
+ "shape": 3,
630
+ "slot_index": 0
631
+ }
632
+ ],
633
+ "properties": {
634
+ "Node name for S&R": "CLIPTextEncode"
635
+ },
636
+ "widgets_values": [
637
+ "closeup two girl friends on the streets of a cyberpunk city, cinematic, hoodie, multicolored hair, highly detailed, 4k, high resolution"
638
+ ]
639
+ },
640
+ {
641
+ "id": 3,
642
+ "type": "KSampler",
643
+ "pos": [
644
+ 1162,
645
+ 38
646
+ ],
647
+ "size": {
648
+ "0": 341.2750244140625,
649
+ "1": 262
650
+ },
651
+ "flags": {},
652
+ "order": 15,
653
+ "mode": 0,
654
+ "inputs": [
655
+ {
656
+ "name": "model",
657
+ "type": "MODEL",
658
+ "link": 142
659
+ },
660
+ {
661
+ "name": "positive",
662
+ "type": "CONDITIONING",
663
+ "link": 160
664
+ },
665
+ {
666
+ "name": "negative",
667
+ "type": "CONDITIONING",
668
+ "link": 34
669
+ },
670
+ {
671
+ "name": "latent_image",
672
+ "type": "LATENT",
673
+ "link": 2
674
+ }
675
+ ],
676
+ "outputs": [
677
+ {
678
+ "name": "LATENT",
679
+ "type": "LATENT",
680
+ "links": [
681
+ 7
682
+ ],
683
+ "slot_index": 0
684
+ }
685
+ ],
686
+ "properties": {
687
+ "Node name for S&R": "KSampler"
688
+ },
689
+ "widgets_values": [
690
+ 70,
691
+ "fixed",
692
+ 30,
693
+ 6,
694
+ "dpmpp_2m",
695
+ "karras",
696
+ 1
697
+ ]
698
+ },
699
+ {
700
+ "id": 4,
701
+ "type": "CheckpointLoaderSimple",
702
+ "pos": [
703
+ -131,
704
+ -342
705
+ ],
706
+ "size": {
707
+ "0": 319.03692626953125,
708
+ "1": 101.3391342163086
709
+ },
710
+ "flags": {},
711
+ "order": 8,
712
+ "mode": 0,
713
+ "outputs": [
714
+ {
715
+ "name": "MODEL",
716
+ "type": "MODEL",
717
+ "links": [
718
+ 133
719
+ ],
720
+ "slot_index": 0
721
+ },
722
+ {
723
+ "name": "CLIP",
724
+ "type": "CLIP",
725
+ "links": [
726
+ 94,
727
+ 156
728
+ ],
729
+ "slot_index": 1
730
+ },
731
+ {
732
+ "name": "VAE",
733
+ "type": "VAE",
734
+ "links": [
735
+ 8
736
+ ],
737
+ "slot_index": 2
738
+ }
739
+ ],
740
+ "properties": {
741
+ "Node name for S&R": "CheckpointLoaderSimple"
742
+ },
743
+ "widgets_values": [
744
+ "sdxl/AlbedoBaseXL.safetensors"
745
+ ]
746
+ }
747
+ ],
748
+ "links": [
749
+ [
750
+ 2,
751
+ 5,
752
+ 0,
753
+ 3,
754
+ 3,
755
+ "LATENT"
756
+ ],
757
+ [
758
+ 7,
759
+ 3,
760
+ 0,
761
+ 8,
762
+ 0,
763
+ "LATENT"
764
+ ],
765
+ [
766
+ 8,
767
+ 4,
768
+ 2,
769
+ 8,
770
+ 1,
771
+ "VAE"
772
+ ],
773
+ [
774
+ 10,
775
+ 8,
776
+ 0,
777
+ 10,
778
+ 0,
779
+ "IMAGE"
780
+ ],
781
+ [
782
+ 34,
783
+ 23,
784
+ 0,
785
+ 3,
786
+ 2,
787
+ "CONDITIONING"
788
+ ],
789
+ [
790
+ 81,
791
+ 19,
792
+ 0,
793
+ 33,
794
+ 2,
795
+ "EVA_CLIP"
796
+ ],
797
+ [
798
+ 82,
799
+ 17,
800
+ 0,
801
+ 33,
802
+ 3,
803
+ "FACEANALYSIS"
804
+ ],
805
+ [
806
+ 94,
807
+ 4,
808
+ 1,
809
+ 23,
810
+ 0,
811
+ "CLIP"
812
+ ],
813
+ [
814
+ 114,
815
+ 12,
816
+ 0,
817
+ 33,
818
+ 4,
819
+ "IMAGE"
820
+ ],
821
+ [
822
+ 117,
823
+ 16,
824
+ 0,
825
+ 33,
826
+ 1,
827
+ "PULID"
828
+ ],
829
+ [
830
+ 133,
831
+ 4,
832
+ 0,
833
+ 33,
834
+ 0,
835
+ "MODEL"
836
+ ],
837
+ [
838
+ 136,
839
+ 16,
840
+ 0,
841
+ 47,
842
+ 1,
843
+ "PULID"
844
+ ],
845
+ [
846
+ 137,
847
+ 19,
848
+ 0,
849
+ 47,
850
+ 2,
851
+ "EVA_CLIP"
852
+ ],
853
+ [
854
+ 138,
855
+ 17,
856
+ 0,
857
+ 47,
858
+ 3,
859
+ "FACEANALYSIS"
860
+ ],
861
+ [
862
+ 141,
863
+ 33,
864
+ 0,
865
+ 47,
866
+ 0,
867
+ "MODEL"
868
+ ],
869
+ [
870
+ 142,
871
+ 47,
872
+ 0,
873
+ 3,
874
+ 0,
875
+ "MODEL"
876
+ ],
877
+ [
878
+ 145,
879
+ 49,
880
+ 0,
881
+ 47,
882
+ 4,
883
+ "IMAGE"
884
+ ],
885
+ [
886
+ 151,
887
+ 48,
888
+ 0,
889
+ 47,
890
+ 5,
891
+ "MASK"
892
+ ],
893
+ [
894
+ 156,
895
+ 4,
896
+ 1,
897
+ 55,
898
+ 0,
899
+ "CLIP"
900
+ ],
901
+ [
902
+ 160,
903
+ 55,
904
+ 0,
905
+ 3,
906
+ 1,
907
+ "CONDITIONING"
908
+ ],
909
+ [
910
+ 244,
911
+ 85,
912
+ 0,
913
+ 87,
914
+ 0,
915
+ "MASK"
916
+ ],
917
+ [
918
+ 245,
919
+ 86,
920
+ 0,
921
+ 87,
922
+ 1,
923
+ "MASK"
924
+ ],
925
+ [
926
+ 246,
927
+ 87,
928
+ 0,
929
+ 48,
930
+ 0,
931
+ "MASK"
932
+ ],
933
+ [
934
+ 247,
935
+ 87,
936
+ 0,
937
+ 33,
938
+ 5,
939
+ "MASK"
940
+ ]
941
+ ],
942
+ "groups": [],
943
+ "config": {},
944
+ "extra": {},
945
+ "version": 0.4
946
+ }
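
The PuLID_attention_mask workflow above splits the 1280×960 canvas between the two reference faces: a zero-valued 1280×960 SolidMask is combined with a one-valued 640×960 SolidMask through MaskComposite ("add" at x=0, y=0), the result drives one ApplyPulid node's attn_mask directly, and InvertMask feeds the complementary half to the other. A minimal PyTorch sketch of that mask logic, illustrative only and not part of the uploaded files (ComfyUI masks are batch × height × width):

    import torch

    # SolidMask value=0, width=1280, height=960 (the "destination" of MaskComposite)
    dest = torch.zeros(1, 960, 1280)
    # SolidMask value=1, width=640, height=960 (the "source")
    src = torch.ones(1, 960, 640)

    # MaskComposite widgets [0, 0, "add"]: paste the source at x=0, y=0 and add
    x, y = 0, 0
    dest[:, y:y + src.shape[1], x:x + src.shape[2]] += src
    mask_a = dest.clamp(0, 1)   # left half = 1 -> attn_mask of one ApplyPulid
    mask_b = 1.0 - mask_a       # InvertMask    -> attn_mask of the other ApplyPulid
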
PuLID_ComfyUI/examples/PuLID_lightning_lora.json ADDED
@@ -0,0 +1,649 @@
1
+ {
2
+ "last_node_id": 45,
3
+ "last_link_id": 132,
4
+ "nodes": [
5
+ {
6
+ "id": 8,
7
+ "type": "VAEDecode",
8
+ "pos": [
9
+ 1210,
10
+ -270
11
+ ],
12
+ "size": {
13
+ "0": 140,
14
+ "1": 46
15
+ },
16
+ "flags": {},
17
+ "order": 11,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "samples",
22
+ "type": "LATENT",
23
+ "link": 7
24
+ },
25
+ {
26
+ "name": "vae",
27
+ "type": "VAE",
28
+ "link": 8
29
+ }
30
+ ],
31
+ "outputs": [
32
+ {
33
+ "name": "IMAGE",
34
+ "type": "IMAGE",
35
+ "links": [
36
+ 10
37
+ ],
38
+ "slot_index": 0
39
+ }
40
+ ],
41
+ "properties": {
42
+ "Node name for S&R": "VAEDecode"
43
+ }
44
+ },
45
+ {
46
+ "id": 19,
47
+ "type": "PulidEvaClipLoader",
48
+ "pos": [
49
+ 130,
50
+ 120
51
+ ],
52
+ "size": {
53
+ "0": 140,
54
+ "1": 26
55
+ },
56
+ "flags": {},
57
+ "order": 0,
58
+ "mode": 0,
59
+ "outputs": [
60
+ {
61
+ "name": "EVA_CLIP",
62
+ "type": "EVA_CLIP",
63
+ "links": [
64
+ 81
65
+ ],
66
+ "shape": 3,
67
+ "slot_index": 0
68
+ }
69
+ ],
70
+ "properties": {
71
+ "Node name for S&R": "PulidEvaClipLoader"
72
+ }
73
+ },
74
+ {
75
+ "id": 17,
76
+ "type": "PulidInsightFaceLoader",
77
+ "pos": [
78
+ 60,
79
+ 190
80
+ ],
81
+ "size": {
82
+ "0": 210,
83
+ "1": 58
84
+ },
85
+ "flags": {},
86
+ "order": 1,
87
+ "mode": 0,
88
+ "outputs": [
89
+ {
90
+ "name": "FACEANALYSIS",
91
+ "type": "FACEANALYSIS",
92
+ "links": [
93
+ 82
94
+ ],
95
+ "shape": 3,
96
+ "slot_index": 0
97
+ }
98
+ ],
99
+ "properties": {
100
+ "Node name for S&R": "PulidInsightFaceLoader"
101
+ },
102
+ "widgets_values": [
103
+ "CPU"
104
+ ]
105
+ },
106
+ {
107
+ "id": 23,
108
+ "type": "CLIPTextEncode",
109
+ "pos": [
110
+ 330,
111
+ -260
112
+ ],
113
+ "size": {
114
+ "0": 334.8077697753906,
115
+ "1": 189.35675048828125
116
+ },
117
+ "flags": {},
118
+ "order": 8,
119
+ "mode": 0,
120
+ "inputs": [
121
+ {
122
+ "name": "clip",
123
+ "type": "CLIP",
124
+ "link": 94
125
+ }
126
+ ],
127
+ "outputs": [
128
+ {
129
+ "name": "CONDITIONING",
130
+ "type": "CONDITIONING",
131
+ "links": [
132
+ 34
133
+ ],
134
+ "shape": 3,
135
+ "slot_index": 0
136
+ }
137
+ ],
138
+ "properties": {
139
+ "Node name for S&R": "CLIPTextEncode"
140
+ },
141
+ "widgets_values": [
142
+ "flaws in the eyes, flaws in the face, flaws, lowres, non-HDRi, low quality, worst quality,artifacts noise, text, watermark, glitch, deformed, mutated, ugly, disfigured, hands, low resolution, partially rendered objects, deformed or partially rendered eyes, deformed, deformed eyeballs, cross-eyed,blurry"
143
+ ]
144
+ },
145
+ {
146
+ "id": 22,
147
+ "type": "CLIPTextEncode",
148
+ "pos": [
149
+ 340,
150
+ -430
151
+ ],
152
+ "size": {
153
+ "0": 315.23089599609375,
154
+ "1": 113.96450805664062
155
+ },
156
+ "flags": {},
157
+ "order": 7,
158
+ "mode": 0,
159
+ "inputs": [
160
+ {
161
+ "name": "clip",
162
+ "type": "CLIP",
163
+ "link": 93
164
+ }
165
+ ],
166
+ "outputs": [
167
+ {
168
+ "name": "CONDITIONING",
169
+ "type": "CONDITIONING",
170
+ "links": [
171
+ 35
172
+ ],
173
+ "shape": 3,
174
+ "slot_index": 0
175
+ }
176
+ ],
177
+ "properties": {
178
+ "Node name for S&R": "CLIPTextEncode"
179
+ },
180
+ "widgets_values": [
181
+ "portrait,cinematic,wolf ears,white hair"
182
+ ]
183
+ },
184
+ {
185
+ "id": 10,
186
+ "type": "PreviewImage",
187
+ "pos": [
188
+ 1230,
189
+ -160
190
+ ],
191
+ "size": [
192
+ 855.3022058439137,
193
+ 1107.2183523542942
194
+ ],
195
+ "flags": {},
196
+ "order": 12,
197
+ "mode": 0,
198
+ "inputs": [
199
+ {
200
+ "name": "images",
201
+ "type": "IMAGE",
202
+ "link": 10
203
+ }
204
+ ],
205
+ "properties": {
206
+ "Node name for S&R": "PreviewImage"
207
+ }
208
+ },
209
+ {
210
+ "id": 12,
211
+ "type": "LoadImage",
212
+ "pos": [
213
+ -117,
214
+ 336
215
+ ],
216
+ "size": {
217
+ "0": 404.07366943359375,
218
+ "1": 496.2817077636719
219
+ },
220
+ "flags": {},
221
+ "order": 2,
222
+ "mode": 0,
223
+ "outputs": [
224
+ {
225
+ "name": "IMAGE",
226
+ "type": "IMAGE",
227
+ "links": [
228
+ 114
229
+ ],
230
+ "shape": 3,
231
+ "slot_index": 0
232
+ },
233
+ {
234
+ "name": "MASK",
235
+ "type": "MASK",
236
+ "links": null,
237
+ "shape": 3
238
+ }
239
+ ],
240
+ "properties": {
241
+ "Node name for S&R": "LoadImage"
242
+ },
243
+ "widgets_values": [
244
+ "monalisa.png",
245
+ "image"
246
+ ]
247
+ },
248
+ {
249
+ "id": 16,
250
+ "type": "PulidModelLoader",
251
+ "pos": [
252
+ -20,
253
+ 20
254
+ ],
255
+ "size": {
256
+ "0": 304.0072021484375,
257
+ "1": 58
258
+ },
259
+ "flags": {},
260
+ "order": 3,
261
+ "mode": 0,
262
+ "outputs": [
263
+ {
264
+ "name": "PULID",
265
+ "type": "PULID",
266
+ "links": [
267
+ 117
268
+ ],
269
+ "shape": 3,
270
+ "slot_index": 0
271
+ }
272
+ ],
273
+ "properties": {
274
+ "Node name for S&R": "PulidModelLoader"
275
+ },
276
+ "widgets_values": [
277
+ "ip-adapter_pulid_sdxl_fp16.safetensors"
278
+ ]
279
+ },
280
+ {
281
+ "id": 45,
282
+ "type": "LoraLoaderModelOnly",
283
+ "pos": [
284
+ 4,
285
+ -328
286
+ ],
287
+ "size": [
288
+ 267.767924663449,
289
+ 82
290
+ ],
291
+ "flags": {},
292
+ "order": 6,
293
+ "mode": 0,
294
+ "inputs": [
295
+ {
296
+ "name": "model",
297
+ "type": "MODEL",
298
+ "link": 129
299
+ }
300
+ ],
301
+ "outputs": [
302
+ {
303
+ "name": "MODEL",
304
+ "type": "MODEL",
305
+ "links": [
306
+ 131
307
+ ],
308
+ "shape": 3,
309
+ "slot_index": 0
310
+ }
311
+ ],
312
+ "properties": {
313
+ "Node name for S&R": "LoraLoaderModelOnly"
314
+ },
315
+ "widgets_values": [
316
+ "sdxl_lightning_4step_lora.safetensors",
317
+ 1
318
+ ]
319
+ },
320
+ {
321
+ "id": 5,
322
+ "type": "EmptyLatentImage",
323
+ "pos": [
324
+ 350,
325
+ 265
326
+ ],
327
+ "size": {
328
+ "0": 315,
329
+ "1": 106
330
+ },
331
+ "flags": {},
332
+ "order": 4,
333
+ "mode": 0,
334
+ "outputs": [
335
+ {
336
+ "name": "LATENT",
337
+ "type": "LATENT",
338
+ "links": [
339
+ 2
340
+ ],
341
+ "slot_index": 0
342
+ }
343
+ ],
344
+ "properties": {
345
+ "Node name for S&R": "EmptyLatentImage"
346
+ },
347
+ "widgets_values": [
348
+ 768,
349
+ 1024,
350
+ 1
351
+ ]
352
+ },
353
+ {
354
+ "id": 33,
355
+ "type": "ApplyPulid",
356
+ "pos": [
357
+ 350,
358
+ -10
359
+ ],
360
+ "size": {
361
+ "0": 315,
362
+ "1": 210
363
+ },
364
+ "flags": {},
365
+ "order": 9,
366
+ "mode": 0,
367
+ "inputs": [
368
+ {
369
+ "name": "model",
370
+ "type": "MODEL",
371
+ "link": 131
372
+ },
373
+ {
374
+ "name": "pulid",
375
+ "type": "PULID",
376
+ "link": 117
377
+ },
378
+ {
379
+ "name": "eva_clip",
380
+ "type": "EVA_CLIP",
381
+ "link": 81
382
+ },
383
+ {
384
+ "name": "face_analysis",
385
+ "type": "FACEANALYSIS",
386
+ "link": 82
387
+ },
388
+ {
389
+ "name": "image",
390
+ "type": "IMAGE",
391
+ "link": 114
392
+ }
393
+ ],
394
+ "outputs": [
395
+ {
396
+ "name": "MODEL",
397
+ "type": "MODEL",
398
+ "links": [
399
+ 132
400
+ ],
401
+ "shape": 3,
402
+ "slot_index": 0
403
+ }
404
+ ],
405
+ "properties": {
406
+ "Node name for S&R": "ApplyPulid"
407
+ },
408
+ "widgets_values": [
409
+ "fidelity",
410
+ 0.8,
411
+ 0,
412
+ 1
413
+ ]
414
+ },
415
+ {
416
+ "id": 4,
417
+ "type": "CheckpointLoaderSimple",
418
+ "pos": [
419
+ -378,
420
+ -329
421
+ ],
422
+ "size": {
423
+ "0": 319.03692626953125,
424
+ "1": 101.3391342163086
425
+ },
426
+ "flags": {},
427
+ "order": 5,
428
+ "mode": 0,
429
+ "outputs": [
430
+ {
431
+ "name": "MODEL",
432
+ "type": "MODEL",
433
+ "links": [
434
+ 129
435
+ ],
436
+ "slot_index": 0
437
+ },
438
+ {
439
+ "name": "CLIP",
440
+ "type": "CLIP",
441
+ "links": [
442
+ 93,
443
+ 94
444
+ ],
445
+ "slot_index": 1
446
+ },
447
+ {
448
+ "name": "VAE",
449
+ "type": "VAE",
450
+ "links": [
451
+ 8
452
+ ],
453
+ "slot_index": 2
454
+ }
455
+ ],
456
+ "properties": {
457
+ "Node name for S&R": "CheckpointLoaderSimple"
458
+ },
459
+ "widgets_values": [
460
+ "sdxl/juggernautXL_version8Rundiffusion.safetensors"
461
+ ]
462
+ },
463
+ {
464
+ "id": 3,
465
+ "type": "KSampler",
466
+ "pos": [
467
+ 800,
468
+ -270
469
+ ],
470
+ "size": {
471
+ "0": 341.2750244140625,
472
+ "1": 262
473
+ },
474
+ "flags": {},
475
+ "order": 10,
476
+ "mode": 0,
477
+ "inputs": [
478
+ {
479
+ "name": "model",
480
+ "type": "MODEL",
481
+ "link": 132
482
+ },
483
+ {
484
+ "name": "positive",
485
+ "type": "CONDITIONING",
486
+ "link": 35
487
+ },
488
+ {
489
+ "name": "negative",
490
+ "type": "CONDITIONING",
491
+ "link": 34
492
+ },
493
+ {
494
+ "name": "latent_image",
495
+ "type": "LATENT",
496
+ "link": 2
497
+ }
498
+ ],
499
+ "outputs": [
500
+ {
501
+ "name": "LATENT",
502
+ "type": "LATENT",
503
+ "links": [
504
+ 7
505
+ ],
506
+ "slot_index": 0
507
+ }
508
+ ],
509
+ "properties": {
510
+ "Node name for S&R": "KSampler"
511
+ },
512
+ "widgets_values": [
513
+ 42,
514
+ "fixed",
515
+ 4,
516
+ 1.2,
517
+ "dpmpp_2m",
518
+ "sgm_uniform",
519
+ 1
520
+ ]
521
+ }
522
+ ],
523
+ "links": [
524
+ [
525
+ 2,
526
+ 5,
527
+ 0,
528
+ 3,
529
+ 3,
530
+ "LATENT"
531
+ ],
532
+ [
533
+ 7,
534
+ 3,
535
+ 0,
536
+ 8,
537
+ 0,
538
+ "LATENT"
539
+ ],
540
+ [
541
+ 8,
542
+ 4,
543
+ 2,
544
+ 8,
545
+ 1,
546
+ "VAE"
547
+ ],
548
+ [
549
+ 10,
550
+ 8,
551
+ 0,
552
+ 10,
553
+ 0,
554
+ "IMAGE"
555
+ ],
556
+ [
557
+ 34,
558
+ 23,
559
+ 0,
560
+ 3,
561
+ 2,
562
+ "CONDITIONING"
563
+ ],
564
+ [
565
+ 35,
566
+ 22,
567
+ 0,
568
+ 3,
569
+ 1,
570
+ "CONDITIONING"
571
+ ],
572
+ [
573
+ 81,
574
+ 19,
575
+ 0,
576
+ 33,
577
+ 2,
578
+ "EVA_CLIP"
579
+ ],
580
+ [
581
+ 82,
582
+ 17,
583
+ 0,
584
+ 33,
585
+ 3,
586
+ "FACEANALYSIS"
587
+ ],
588
+ [
589
+ 93,
590
+ 4,
591
+ 1,
592
+ 22,
593
+ 0,
594
+ "CLIP"
595
+ ],
596
+ [
597
+ 94,
598
+ 4,
599
+ 1,
600
+ 23,
601
+ 0,
602
+ "CLIP"
603
+ ],
604
+ [
605
+ 114,
606
+ 12,
607
+ 0,
608
+ 33,
609
+ 4,
610
+ "IMAGE"
611
+ ],
612
+ [
613
+ 117,
614
+ 16,
615
+ 0,
616
+ 33,
617
+ 1,
618
+ "PULID"
619
+ ],
620
+ [
621
+ 129,
622
+ 4,
623
+ 0,
624
+ 45,
625
+ 0,
626
+ "MODEL"
627
+ ],
628
+ [
629
+ 131,
630
+ 45,
631
+ 0,
632
+ 33,
633
+ 0,
634
+ "MODEL"
635
+ ],
636
+ [
637
+ 132,
638
+ 33,
639
+ 0,
640
+ 3,
641
+ 0,
642
+ "MODEL"
643
+ ]
644
+ ],
645
+ "groups": [],
646
+ "config": {},
647
+ "extra": {},
648
+ "version": 0.4
649
+ }
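
In this workflow JSON (as in the other examples here) every "links" entry is a flat LiteGraph record, [link_id, from_node, from_output_slot, to_node, to_input_slot, type]; link 129 above, for example, carries the checkpoint's MODEL output into LoraLoaderModelOnly before ApplyPulid and the 4-step lightning KSampler (4 steps, cfg 1.2, sgm_uniform). A short sketch that decodes those records (the file path is illustrative):

    import json

    with open("PuLID_lightning_lora.json") as f:   # illustrative path
        wf = json.load(f)

    node_types = {node["id"]: node["type"] for node in wf["nodes"]}
    for link_id, src, src_slot, dst, dst_slot, ltype in wf["links"]:
        print(f"link {link_id}: {node_types[src]}[{src_slot}] --{ltype}--> {node_types[dst]}[{dst_slot}]")
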
PuLID_ComfyUI/examples/PuLID_simple.json ADDED
@@ -0,0 +1,601 @@
1
+ {
2
+ "last_node_id": 45,
3
+ "last_link_id": 133,
4
+ "nodes": [
5
+ {
6
+ "id": 8,
7
+ "type": "VAEDecode",
8
+ "pos": [
9
+ 1210,
10
+ -270
11
+ ],
12
+ "size": {
13
+ "0": 140,
14
+ "1": 46
15
+ },
16
+ "flags": {},
17
+ "order": 10,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "samples",
22
+ "type": "LATENT",
23
+ "link": 7
24
+ },
25
+ {
26
+ "name": "vae",
27
+ "type": "VAE",
28
+ "link": 8
29
+ }
30
+ ],
31
+ "outputs": [
32
+ {
33
+ "name": "IMAGE",
34
+ "type": "IMAGE",
35
+ "links": [
36
+ 10
37
+ ],
38
+ "slot_index": 0
39
+ }
40
+ ],
41
+ "properties": {
42
+ "Node name for S&R": "VAEDecode"
43
+ }
44
+ },
45
+ {
46
+ "id": 19,
47
+ "type": "PulidEvaClipLoader",
48
+ "pos": [
49
+ 130,
50
+ 120
51
+ ],
52
+ "size": {
53
+ "0": 140,
54
+ "1": 26
55
+ },
56
+ "flags": {},
57
+ "order": 0,
58
+ "mode": 0,
59
+ "outputs": [
60
+ {
61
+ "name": "EVA_CLIP",
62
+ "type": "EVA_CLIP",
63
+ "links": [
64
+ 81
65
+ ],
66
+ "shape": 3,
67
+ "slot_index": 0
68
+ }
69
+ ],
70
+ "properties": {
71
+ "Node name for S&R": "PulidEvaClipLoader"
72
+ }
73
+ },
74
+ {
75
+ "id": 17,
76
+ "type": "PulidInsightFaceLoader",
77
+ "pos": [
78
+ 60,
79
+ 190
80
+ ],
81
+ "size": {
82
+ "0": 210,
83
+ "1": 58
84
+ },
85
+ "flags": {},
86
+ "order": 1,
87
+ "mode": 0,
88
+ "outputs": [
89
+ {
90
+ "name": "FACEANALYSIS",
91
+ "type": "FACEANALYSIS",
92
+ "links": [
93
+ 82
94
+ ],
95
+ "shape": 3,
96
+ "slot_index": 0
97
+ }
98
+ ],
99
+ "properties": {
100
+ "Node name for S&R": "PulidInsightFaceLoader"
101
+ },
102
+ "widgets_values": [
103
+ "CPU"
104
+ ]
105
+ },
106
+ {
107
+ "id": 16,
108
+ "type": "PulidModelLoader",
109
+ "pos": [
110
+ -20,
111
+ 20
112
+ ],
113
+ "size": {
114
+ "0": 304.0072021484375,
115
+ "1": 58
116
+ },
117
+ "flags": {},
118
+ "order": 2,
119
+ "mode": 0,
120
+ "outputs": [
121
+ {
122
+ "name": "PULID",
123
+ "type": "PULID",
124
+ "links": [
125
+ 117
126
+ ],
127
+ "shape": 3,
128
+ "slot_index": 0
129
+ }
130
+ ],
131
+ "properties": {
132
+ "Node name for S&R": "PulidModelLoader"
133
+ },
134
+ "widgets_values": [
135
+ "ip-adapter_pulid_sdxl_fp16.safetensors"
136
+ ]
137
+ },
138
+ {
139
+ "id": 5,
140
+ "type": "EmptyLatentImage",
141
+ "pos": [
142
+ 350,
143
+ 265
144
+ ],
145
+ "size": {
146
+ "0": 315,
147
+ "1": 106
148
+ },
149
+ "flags": {},
150
+ "order": 3,
151
+ "mode": 0,
152
+ "outputs": [
153
+ {
154
+ "name": "LATENT",
155
+ "type": "LATENT",
156
+ "links": [
157
+ 2
158
+ ],
159
+ "slot_index": 0
160
+ }
161
+ ],
162
+ "properties": {
163
+ "Node name for S&R": "EmptyLatentImage"
164
+ },
165
+ "widgets_values": [
166
+ 768,
167
+ 1024,
168
+ 1
169
+ ]
170
+ },
171
+ {
172
+ "id": 33,
173
+ "type": "ApplyPulid",
174
+ "pos": [
175
+ 350,
176
+ -10
177
+ ],
178
+ "size": {
179
+ "0": 315,
180
+ "1": 210
181
+ },
182
+ "flags": {},
183
+ "order": 6,
184
+ "mode": 0,
185
+ "inputs": [
186
+ {
187
+ "name": "model",
188
+ "type": "MODEL",
189
+ "link": 133
190
+ },
191
+ {
192
+ "name": "pulid",
193
+ "type": "PULID",
194
+ "link": 117
195
+ },
196
+ {
197
+ "name": "eva_clip",
198
+ "type": "EVA_CLIP",
199
+ "link": 81
200
+ },
201
+ {
202
+ "name": "face_analysis",
203
+ "type": "FACEANALYSIS",
204
+ "link": 82
205
+ },
206
+ {
207
+ "name": "image",
208
+ "type": "IMAGE",
209
+ "link": 114
210
+ }
211
+ ],
212
+ "outputs": [
213
+ {
214
+ "name": "MODEL",
215
+ "type": "MODEL",
216
+ "links": [
217
+ 132
218
+ ],
219
+ "shape": 3,
220
+ "slot_index": 0
221
+ }
222
+ ],
223
+ "properties": {
224
+ "Node name for S&R": "ApplyPulid"
225
+ },
226
+ "widgets_values": [
227
+ "fidelity",
228
+ 0.8,
229
+ 0,
230
+ 1
231
+ ]
232
+ },
233
+ {
234
+ "id": 23,
235
+ "type": "CLIPTextEncode",
236
+ "pos": [
237
+ 330,
238
+ -260
239
+ ],
240
+ "size": {
241
+ "0": 334.8077697753906,
242
+ "1": 189.35675048828125
243
+ },
244
+ "flags": {},
245
+ "order": 8,
246
+ "mode": 0,
247
+ "inputs": [
248
+ {
249
+ "name": "clip",
250
+ "type": "CLIP",
251
+ "link": 94
252
+ }
253
+ ],
254
+ "outputs": [
255
+ {
256
+ "name": "CONDITIONING",
257
+ "type": "CONDITIONING",
258
+ "links": [
259
+ 34
260
+ ],
261
+ "shape": 3,
262
+ "slot_index": 0
263
+ }
264
+ ],
265
+ "properties": {
266
+ "Node name for S&R": "CLIPTextEncode"
267
+ },
268
+ "widgets_values": [
269
+ "blurry, malformed, low quality, worst quality, artifacts, noise, text, watermark, glitch, deformed, ugly, horror, ill"
270
+ ]
271
+ },
272
+ {
273
+ "id": 22,
274
+ "type": "CLIPTextEncode",
275
+ "pos": [
276
+ 340,
277
+ -430
278
+ ],
279
+ "size": {
280
+ "0": 315.23089599609375,
281
+ "1": 113.96450805664062
282
+ },
283
+ "flags": {},
284
+ "order": 7,
285
+ "mode": 0,
286
+ "inputs": [
287
+ {
288
+ "name": "clip",
289
+ "type": "CLIP",
290
+ "link": 93
291
+ }
292
+ ],
293
+ "outputs": [
294
+ {
295
+ "name": "CONDITIONING",
296
+ "type": "CONDITIONING",
297
+ "links": [
298
+ 35
299
+ ],
300
+ "shape": 3,
301
+ "slot_index": 0
302
+ }
303
+ ],
304
+ "properties": {
305
+ "Node name for S&R": "CLIPTextEncode"
306
+ },
307
+ "widgets_values": [
308
+ "closeup portrait, cyberpunk, cinematic, hoodie, purple hair, highly detailed, 4k, high resolution"
309
+ ]
310
+ },
311
+ {
312
+ "id": 3,
313
+ "type": "KSampler",
314
+ "pos": [
315
+ 800,
316
+ -270
317
+ ],
318
+ "size": {
319
+ "0": 341.2750244140625,
320
+ "1": 262
321
+ },
322
+ "flags": {},
323
+ "order": 9,
324
+ "mode": 0,
325
+ "inputs": [
326
+ {
327
+ "name": "model",
328
+ "type": "MODEL",
329
+ "link": 132
330
+ },
331
+ {
332
+ "name": "positive",
333
+ "type": "CONDITIONING",
334
+ "link": 35
335
+ },
336
+ {
337
+ "name": "negative",
338
+ "type": "CONDITIONING",
339
+ "link": 34
340
+ },
341
+ {
342
+ "name": "latent_image",
343
+ "type": "LATENT",
344
+ "link": 2
345
+ }
346
+ ],
347
+ "outputs": [
348
+ {
349
+ "name": "LATENT",
350
+ "type": "LATENT",
351
+ "links": [
352
+ 7
353
+ ],
354
+ "slot_index": 0
355
+ }
356
+ ],
357
+ "properties": {
358
+ "Node name for S&R": "KSampler"
359
+ },
360
+ "widgets_values": [
361
+ 51,
362
+ "fixed",
363
+ 30,
364
+ 6,
365
+ "dpmpp_2m",
366
+ "sgm_uniform",
367
+ 1
368
+ ]
369
+ },
370
+ {
371
+ "id": 12,
372
+ "type": "LoadImage",
373
+ "pos": [
374
+ -115,
375
+ 310
376
+ ],
377
+ "size": {
378
+ "0": 404.07366943359375,
379
+ "1": 496.2817077636719
380
+ },
381
+ "flags": {},
382
+ "order": 4,
383
+ "mode": 0,
384
+ "outputs": [
385
+ {
386
+ "name": "IMAGE",
387
+ "type": "IMAGE",
388
+ "links": [
389
+ 114
390
+ ],
391
+ "shape": 3,
392
+ "slot_index": 0
393
+ },
394
+ {
395
+ "name": "MASK",
396
+ "type": "MASK",
397
+ "links": null,
398
+ "shape": 3
399
+ }
400
+ ],
401
+ "properties": {
402
+ "Node name for S&R": "LoadImage"
403
+ },
404
+ "widgets_values": [
405
+ "monalisa.png",
406
+ "image"
407
+ ]
408
+ },
409
+ {
410
+ "id": 4,
411
+ "type": "CheckpointLoaderSimple",
412
+ "pos": [
413
+ -97,
414
+ -265
415
+ ],
416
+ "size": {
417
+ "0": 319.03692626953125,
418
+ "1": 101.3391342163086
419
+ },
420
+ "flags": {},
421
+ "order": 5,
422
+ "mode": 0,
423
+ "outputs": [
424
+ {
425
+ "name": "MODEL",
426
+ "type": "MODEL",
427
+ "links": [
428
+ 133
429
+ ],
430
+ "slot_index": 0
431
+ },
432
+ {
433
+ "name": "CLIP",
434
+ "type": "CLIP",
435
+ "links": [
436
+ 93,
437
+ 94
438
+ ],
439
+ "slot_index": 1
440
+ },
441
+ {
442
+ "name": "VAE",
443
+ "type": "VAE",
444
+ "links": [
445
+ 8
446
+ ],
447
+ "slot_index": 2
448
+ }
449
+ ],
450
+ "properties": {
451
+ "Node name for S&R": "CheckpointLoaderSimple"
452
+ },
453
+ "widgets_values": [
454
+ "sdxl/Proteus-RunDiffusion.safetensors"
455
+ ]
456
+ },
457
+ {
458
+ "id": 10,
459
+ "type": "PreviewImage",
460
+ "pos": [
461
+ 1181,
462
+ -162
463
+ ],
464
+ "size": [
465
+ 705.6038401281248,
466
+ 950.4616015812499
467
+ ],
468
+ "flags": {},
469
+ "order": 11,
470
+ "mode": 0,
471
+ "inputs": [
472
+ {
473
+ "name": "images",
474
+ "type": "IMAGE",
475
+ "link": 10
476
+ }
477
+ ],
478
+ "properties": {
479
+ "Node name for S&R": "PreviewImage"
480
+ }
481
+ }
482
+ ],
483
+ "links": [
484
+ [
485
+ 2,
486
+ 5,
487
+ 0,
488
+ 3,
489
+ 3,
490
+ "LATENT"
491
+ ],
492
+ [
493
+ 7,
494
+ 3,
495
+ 0,
496
+ 8,
497
+ 0,
498
+ "LATENT"
499
+ ],
500
+ [
501
+ 8,
502
+ 4,
503
+ 2,
504
+ 8,
505
+ 1,
506
+ "VAE"
507
+ ],
508
+ [
509
+ 10,
510
+ 8,
511
+ 0,
512
+ 10,
513
+ 0,
514
+ "IMAGE"
515
+ ],
516
+ [
517
+ 34,
518
+ 23,
519
+ 0,
520
+ 3,
521
+ 2,
522
+ "CONDITIONING"
523
+ ],
524
+ [
525
+ 35,
526
+ 22,
527
+ 0,
528
+ 3,
529
+ 1,
530
+ "CONDITIONING"
531
+ ],
532
+ [
533
+ 81,
534
+ 19,
535
+ 0,
536
+ 33,
537
+ 2,
538
+ "EVA_CLIP"
539
+ ],
540
+ [
541
+ 82,
542
+ 17,
543
+ 0,
544
+ 33,
545
+ 3,
546
+ "FACEANALYSIS"
547
+ ],
548
+ [
549
+ 93,
550
+ 4,
551
+ 1,
552
+ 22,
553
+ 0,
554
+ "CLIP"
555
+ ],
556
+ [
557
+ 94,
558
+ 4,
559
+ 1,
560
+ 23,
561
+ 0,
562
+ "CLIP"
563
+ ],
564
+ [
565
+ 114,
566
+ 12,
567
+ 0,
568
+ 33,
569
+ 4,
570
+ "IMAGE"
571
+ ],
572
+ [
573
+ 117,
574
+ 16,
575
+ 0,
576
+ 33,
577
+ 1,
578
+ "PULID"
579
+ ],
580
+ [
581
+ 132,
582
+ 33,
583
+ 0,
584
+ 3,
585
+ 0,
586
+ "MODEL"
587
+ ],
588
+ [
589
+ 133,
590
+ 4,
591
+ 0,
592
+ 33,
593
+ 0,
594
+ "MODEL"
595
+ ]
596
+ ],
597
+ "groups": [],
598
+ "config": {},
599
+ "extra": {},
600
+ "version": 0.4
601
+ }
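
The KSampler "widgets_values" in these workflows are positional. Read against the node's widget order (an inference from the KSampler node, not stated in the file) they decode to seed, seed-control mode ("control_after_generate", a UI-only widget), steps, cfg, sampler_name, scheduler and denoise, so the list above means seed 51, 30 steps at cfg 6 with dpmpp_2m/sgm_uniform and full denoise. A minimal sketch of that mapping:

    KSAMPLER_WIDGETS = ["seed", "control_after_generate", "steps", "cfg",
                        "sampler_name", "scheduler", "denoise"]

    widgets_values = [51, "fixed", 30, 6, "dpmpp_2m", "sgm_uniform", 1]   # from the JSON above
    settings = dict(zip(KSAMPLER_WIDGETS, widgets_values))
    print(settings["steps"], settings["cfg"], settings["scheduler"])      # 30 6 sgm_uniform
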
PuLID_ComfyUI/examples/pulid_wf.jpg ADDED

Git LFS Details

  • SHA256: bb945f14a747a03cfbacf20d3ef3be2f3d9b1b60757ce542f35e21ac8d922180
  • Pointer size: 131 Bytes
  • Size of remote file: 133 kB
PuLID_ComfyUI/pulid.py ADDED
@@ -0,0 +1,492 @@
1
+ import torch
2
+ from torch import nn
3
+ import torchvision.transforms as T
4
+ import torch.nn.functional as F
5
+ import os
6
+ import math
7
+ import folder_paths
8
+ import comfy.utils
9
+ from insightface.app import FaceAnalysis
10
+ from facexlib.parsing import init_parsing_model
11
+ from facexlib.utils.face_restoration_helper import FaceRestoreHelper
12
+ from comfy.ldm.modules.attention import optimized_attention
13
+
14
+ from .eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
15
+
16
+ from .encoders import IDEncoder
17
+
18
+ INSIGHTFACE_DIR = os.path.join(folder_paths.models_dir, "insightface")
19
+
20
+ MODELS_DIR = os.path.join(folder_paths.models_dir, "pulid")
21
+ if "pulid" not in folder_paths.folder_names_and_paths:
22
+ current_paths = [MODELS_DIR]
23
+ else:
24
+ current_paths, _ = folder_paths.folder_names_and_paths["pulid"]
25
+ folder_paths.folder_names_and_paths["pulid"] = (current_paths, folder_paths.supported_pt_extensions)
26
+
27
+ class PulidModel(nn.Module):
28
+ def __init__(self, model):
29
+ super().__init__()
30
+
31
+ self.model = model
32
+ self.image_proj_model = self.init_id_adapter()
33
+ self.image_proj_model.load_state_dict(model["image_proj"])
34
+ self.ip_layers = To_KV(model["ip_adapter"])
35
+
36
+ def init_id_adapter(self):
37
+ image_proj_model = IDEncoder()
38
+ return image_proj_model
39
+
40
+ def get_image_embeds(self, face_embed, clip_embeds):
41
+ embeds = self.image_proj_model(face_embed, clip_embeds)
42
+ return embeds
43
+
44
+ class To_KV(nn.Module):
45
+ def __init__(self, state_dict):
46
+ super().__init__()
47
+
48
+ self.to_kvs = nn.ModuleDict()
49
+ for key, value in state_dict.items():
50
+ self.to_kvs[key.replace(".weight", "").replace(".", "_")] = nn.Linear(value.shape[1], value.shape[0], bias=False)
51
+ self.to_kvs[key.replace(".weight", "").replace(".", "_")].weight.data = value
52
+
53
+ def tensor_to_image(tensor):
54
+ image = tensor.mul(255).clamp(0, 255).byte().cpu()
55
+ image = image[..., [2, 1, 0]].numpy()
56
+ return image
57
+
58
+ def image_to_tensor(image):
59
+ tensor = torch.clamp(torch.from_numpy(image).float() / 255., 0, 1)
60
+ tensor = tensor[..., [2, 1, 0]]
61
+ return tensor
62
+
63
+ def tensor_to_size(source, dest_size):
64
+ if isinstance(dest_size, torch.Tensor):
65
+ dest_size = dest_size.shape[0]
66
+ source_size = source.shape[0]
67
+
68
+ if source_size < dest_size:
69
+ shape = [dest_size - source_size] + [1]*(source.dim()-1)
70
+ source = torch.cat((source, source[-1:].repeat(shape)), dim=0)
71
+ elif source_size > dest_size:
72
+ source = source[:dest_size]
73
+
74
+ return source
75
+
76
+ def set_model_patch_replace(model, patch_kwargs, key):
77
+ to = model.model_options["transformer_options"].copy()
78
+ if "patches_replace" not in to:
79
+ to["patches_replace"] = {}
80
+ else:
81
+ to["patches_replace"] = to["patches_replace"].copy()
82
+
83
+ if "attn2" not in to["patches_replace"]:
84
+ to["patches_replace"]["attn2"] = {}
85
+ else:
86
+ to["patches_replace"]["attn2"] = to["patches_replace"]["attn2"].copy()
87
+
88
+ if key not in to["patches_replace"]["attn2"]:
89
+ to["patches_replace"]["attn2"][key] = Attn2Replace(pulid_attention, **patch_kwargs)
90
+ model.model_options["transformer_options"] = to
91
+ else:
92
+ to["patches_replace"]["attn2"][key].add(pulid_attention, **patch_kwargs)
93
+
94
+ class Attn2Replace:
95
+ def __init__(self, callback=None, **kwargs):
96
+ self.callback = [callback]
97
+ self.kwargs = [kwargs]
98
+
99
+ def add(self, callback, **kwargs):
100
+ self.callback.append(callback)
101
+ self.kwargs.append(kwargs)
102
+
103
+ for key, value in kwargs.items():
104
+ setattr(self, key, value)
105
+
106
+ def __call__(self, q, k, v, extra_options):
107
+ dtype = q.dtype
108
+ out = optimized_attention(q, k, v, extra_options["n_heads"])
109
+ sigma = extra_options["sigmas"].detach().cpu()[0].item() if 'sigmas' in extra_options else 999999999.9
110
+
111
+ for i, callback in enumerate(self.callback):
112
+ if sigma <= self.kwargs[i]["sigma_start"] and sigma >= self.kwargs[i]["sigma_end"]:
113
+ out = out + callback(out, q, k, v, extra_options, **self.kwargs[i])
114
+
115
+ return out.to(dtype=dtype)
116
+
117
+ def pulid_attention(out, q, k, v, extra_options, module_key='', pulid=None, cond=None, uncond=None, weight=1.0, ortho=False, ortho_v2=False, mask=None, **kwargs):
118
+ k_key = module_key + "_to_k_ip"
119
+ v_key = module_key + "_to_v_ip"
120
+
121
+ dtype = q.dtype
122
+ seq_len = q.shape[1]
123
+ cond_or_uncond = extra_options["cond_or_uncond"]
124
+ b = q.shape[0]
125
+ batch_prompt = b // len(cond_or_uncond)
126
+ _, _, oh, ow = extra_options["original_shape"]
127
+
128
+ #conds = torch.cat([uncond.repeat(batch_prompt, 1, 1), cond.repeat(batch_prompt, 1, 1)], dim=0)
129
+ #zero_tensor = torch.zeros((conds.size(0), num_zero, conds.size(-1)), dtype=conds.dtype, device=conds.device)
130
+ #conds = torch.cat([conds, zero_tensor], dim=1)
131
+ #ip_k = pulid.ip_layers.to_kvs[k_key](conds)
132
+ #ip_v = pulid.ip_layers.to_kvs[v_key](conds)
133
+
134
+ k_cond = pulid.ip_layers.to_kvs[k_key](cond).repeat(batch_prompt, 1, 1)
135
+ k_uncond = pulid.ip_layers.to_kvs[k_key](uncond).repeat(batch_prompt, 1, 1)
136
+ v_cond = pulid.ip_layers.to_kvs[v_key](cond).repeat(batch_prompt, 1, 1)
137
+ v_uncond = pulid.ip_layers.to_kvs[v_key](uncond).repeat(batch_prompt, 1, 1)
138
+ ip_k = torch.cat([(k_cond, k_uncond)[i] for i in cond_or_uncond], dim=0)
139
+ ip_v = torch.cat([(v_cond, v_uncond)[i] for i in cond_or_uncond], dim=0)
140
+
141
+ out_ip = optimized_attention(q, ip_k, ip_v, extra_options["n_heads"])
142
+
143
+ if ortho:
144
+ out = out.to(dtype=torch.float32)
145
+ out_ip = out_ip.to(dtype=torch.float32)
146
+ projection = (torch.sum((out * out_ip), dim=-2, keepdim=True) / torch.sum((out * out), dim=-2, keepdim=True) * out)
147
+ orthogonal = out_ip - projection
148
+ out_ip = weight * orthogonal
149
+ elif ortho_v2:
150
+ out = out.to(dtype=torch.float32)
151
+ out_ip = out_ip.to(dtype=torch.float32)
152
+ attn_map = q @ ip_k.transpose(-2, -1)
153
+ attn_mean = attn_map.softmax(dim=-1).mean(dim=1, keepdim=True)
154
+ attn_mean = attn_mean[:, :, :5].sum(dim=-1, keepdim=True)
155
+ projection = (torch.sum((out * out_ip), dim=-2, keepdim=True) / torch.sum((out * out), dim=-2, keepdim=True) * out)
156
+ orthogonal = out_ip + (attn_mean - 1) * projection
157
+ out_ip = weight * orthogonal
158
+ else:
159
+ out_ip = out_ip * weight
160
+
161
+ if mask is not None:
162
+ mask_h = oh / math.sqrt(oh * ow / seq_len)
163
+ mask_h = int(mask_h) + int((seq_len % int(mask_h)) != 0)
164
+ mask_w = seq_len // mask_h
165
+
166
+ mask = F.interpolate(mask.unsqueeze(1), size=(mask_h, mask_w), mode="bilinear").squeeze(1)
167
+ mask = tensor_to_size(mask, batch_prompt)
168
+
169
+ mask = mask.repeat(len(cond_or_uncond), 1, 1)
170
+ mask = mask.view(mask.shape[0], -1, 1).repeat(1, 1, out.shape[2])
171
+
172
+ # covers cases where extreme aspect ratios can cause the mask to have a wrong size
173
+ mask_len = mask_h * mask_w
174
+ if mask_len < seq_len:
175
+ pad_len = seq_len - mask_len
176
+ pad1 = pad_len // 2
177
+ pad2 = pad_len - pad1
178
+ mask = F.pad(mask, (0, 0, pad1, pad2), value=0.0)
179
+ elif mask_len > seq_len:
180
+ crop_start = (mask_len - seq_len) // 2
181
+ mask = mask[:, crop_start:crop_start+seq_len, :]
182
+
183
+ out_ip = out_ip * mask
184
+
185
+ return out_ip.to(dtype=dtype)
186
+
187
+ def to_gray(img):
188
+ x = 0.299 * img[:, 0:1] + 0.587 * img[:, 1:2] + 0.114 * img[:, 2:3]
189
+ x = x.repeat(1, 3, 1, 1)
190
+ return x
191
+
192
+ """
193
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
194
+ Nodes
195
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
196
+ """
197
+
198
+ class PulidModelLoader:
199
+ @classmethod
200
+ def INPUT_TYPES(s):
201
+ return {"required": { "pulid_file": (folder_paths.get_filename_list("pulid"), )}}
202
+
203
+ RETURN_TYPES = ("PULID",)
204
+ FUNCTION = "load_model"
205
+ CATEGORY = "pulid"
206
+
207
+ def load_model(self, pulid_file):
208
+ ckpt_path = folder_paths.get_full_path("pulid", pulid_file)
209
+
210
+ model = comfy.utils.load_torch_file(ckpt_path, safe_load=True)
211
+
212
+ if ckpt_path.lower().endswith(".safetensors"):
213
+ st_model = {"image_proj": {}, "ip_adapter": {}}
214
+ for key in model.keys():
215
+ if key.startswith("image_proj."):
216
+ st_model["image_proj"][key.replace("image_proj.", "")] = model[key]
217
+ elif key.startswith("ip_adapter."):
218
+ st_model["ip_adapter"][key.replace("ip_adapter.", "")] = model[key]
219
+ model = st_model
220
+
221
+ # Also initialize the model, takes longer to load but then it doesn't have to be done every time you change parameters in the apply node
222
+ model = PulidModel(model)
223
+
224
+ return (model,)
225
+
226
+ class PulidInsightFaceLoader:
227
+ @classmethod
228
+ def INPUT_TYPES(s):
229
+ return {
230
+ "required": {
231
+ "provider": (["CPU", "CUDA", "ROCM"], ),
232
+ },
233
+ }
234
+
235
+ RETURN_TYPES = ("FACEANALYSIS",)
236
+ FUNCTION = "load_insightface"
237
+ CATEGORY = "pulid"
238
+
239
+ def load_insightface(self, provider):
240
+ model = FaceAnalysis(name="antelopev2", root=INSIGHTFACE_DIR, providers=[provider + 'ExecutionProvider',]) # alternative to buffalo_l
241
+ model.prepare(ctx_id=0, det_size=(640, 640))
242
+
243
+ return (model,)
244
+
245
+ class PulidEvaClipLoader:
246
+ @classmethod
247
+ def INPUT_TYPES(s):
248
+ return {
249
+ "required": {},
250
+ }
251
+
252
+ RETURN_TYPES = ("EVA_CLIP",)
253
+ FUNCTION = "load_eva_clip"
254
+ CATEGORY = "pulid"
255
+
256
+ def load_eva_clip(self):
257
+ from .eva_clip.factory import create_model_and_transforms
258
+
259
+ model, _, _ = create_model_and_transforms('EVA02-CLIP-L-14-336', 'eva_clip', force_custom_clip=True)
260
+
261
+ model = model.visual
262
+
263
+ eva_transform_mean = getattr(model, 'image_mean', OPENAI_DATASET_MEAN)
264
+ eva_transform_std = getattr(model, 'image_std', OPENAI_DATASET_STD)
265
+ if not isinstance(eva_transform_mean, (list, tuple)):
266
+ model.image_mean = (eva_transform_mean,) * 3  # attribute assignment; the EVA visual module is not subscriptable
267
+ if not isinstance(eva_transform_std, (list, tuple)):
268
+ model.image_std = (eva_transform_std,) * 3
269
+
270
+ return (model,)
271
+
272
+
273
+ class ApplyPulid:
274
+ @classmethod
275
+ def INPUT_TYPES(s):
276
+ return {
277
+ "required": {
278
+ "model": ("MODEL", ),
279
+ "pulid": ("PULID", ),
280
+ "eva_clip": ("EVA_CLIP", ),
281
+ "face_analysis": ("FACEANALYSIS", ),
282
+ "image": ("IMAGE", ),
283
+ "method": (["fidelity", "style", "neutral"],),
284
+ "weight": ("FLOAT", {"default": 1.0, "min": -1.0, "max": 5.0, "step": 0.05 }),
285
+ "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
286
+ "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
287
+ },
288
+ "optional": {
289
+ "attn_mask": ("MASK", ),
290
+ },
291
+ }
292
+
293
+ RETURN_TYPES = ("MODEL",)
294
+ FUNCTION = "apply_pulid"
295
+ CATEGORY = "pulid"
296
+
297
+ def apply_pulid(self, model, pulid, eva_clip, face_analysis, image, weight, start_at, end_at, method=None, noise=0.0, fidelity=None, projection=None, attn_mask=None):
298
+ work_model = model.clone()
299
+
300
+ device = comfy.model_management.get_torch_device()
301
+ dtype = comfy.model_management.unet_dtype()
302
+ if dtype not in [torch.float32, torch.float16, torch.bfloat16]:
303
+ dtype = torch.float16 if comfy.model_management.should_use_fp16() else torch.float32
304
+
305
+ eva_clip.to(device, dtype=dtype)
306
+ pulid_model = pulid.to(device, dtype=dtype)
307
+
308
+ if attn_mask is not None:
309
+ if attn_mask.dim() > 3:
310
+ attn_mask = attn_mask.squeeze(-1)
311
+ elif attn_mask.dim() < 3:
312
+ attn_mask = attn_mask.unsqueeze(0)
313
+ attn_mask = attn_mask.to(device, dtype=dtype)
314
+
315
+ if method == "fidelity" or projection == "ortho_v2":
316
+ num_zero = 8
317
+ ortho = False
318
+ ortho_v2 = True
319
+ elif method == "style" or projection == "ortho":
320
+ num_zero = 16
321
+ ortho = True
322
+ ortho_v2 = False
323
+ else:
324
+ num_zero = 0
325
+ ortho = False
326
+ ortho_v2 = False
327
+
328
+ if fidelity is not None:
329
+ num_zero = fidelity
330
+
331
+ #face_analysis.det_model.input_size = (640,640)
332
+ image = tensor_to_image(image)
333
+
334
+ face_helper = FaceRestoreHelper(
335
+ upscale_factor=1,
336
+ face_size=512,
337
+ crop_ratio=(1, 1),
338
+ det_model='retinaface_resnet50',
339
+ save_ext='png',
340
+ device=device,
341
+ )
342
+
343
+ face_helper.face_parse = None
344
+ face_helper.face_parse = init_parsing_model(model_name='bisenet', device=device)
345
+
346
+ bg_label = [0, 16, 18, 7, 8, 9, 14, 15]
347
+ cond = []
348
+ uncond = []
349
+
350
+ for i in range(image.shape[0]):
351
+ # get insightface embeddings
352
+ iface_embeds = None
353
+ for size in [(size, size) for size in range(640, 256, -64)]:
354
+ face_analysis.det_model.input_size = size
355
+ face = face_analysis.get(image[i])
356
+ if face:
357
+ face = sorted(face, key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]), reverse=True)[-1]
358
+ iface_embeds = torch.from_numpy(face.embedding).unsqueeze(0).to(device, dtype=dtype)
359
+ break
360
+ else:
361
+ raise Exception('insightface: No face detected.')
362
+
363
+ # get eva_clip embeddings
364
+ face_helper.clean_all()
365
+ face_helper.read_image(image[i])
366
+ face_helper.get_face_landmarks_5(only_center_face=True)
367
+ face_helper.align_warp_face()
368
+
369
+ if len(face_helper.cropped_faces) == 0:
370
+ raise Exception('facexlib: No face detected.')
371
+
372
+ face = face_helper.cropped_faces[0]
373
+ face = image_to_tensor(face).unsqueeze(0).permute(0,3,1,2).to(device)
374
+ parsing_out = face_helper.face_parse(T.functional.normalize(face, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))[0]
375
+ parsing_out = parsing_out.argmax(dim=1, keepdim=True)
376
+ bg = sum(parsing_out == i for i in bg_label).bool()
377
+ white_image = torch.ones_like(face)
378
+ face_features_image = torch.where(bg, white_image, to_gray(face))
379
+ # apparently MPS only supports NEAREST interpolation?
380
+ face_features_image = T.functional.resize(face_features_image, eva_clip.image_size, T.InterpolationMode.BICUBIC if 'cuda' in device.type else T.InterpolationMode.NEAREST).to(device, dtype=dtype)
381
+ face_features_image = T.functional.normalize(face_features_image, eva_clip.image_mean, eva_clip.image_std)
382
+
383
+ id_cond_vit, id_vit_hidden = eva_clip(face_features_image, return_all_features=False, return_hidden=True, shuffle=False)
384
+ id_cond_vit = id_cond_vit.to(device, dtype=dtype)
385
+ for idx in range(len(id_vit_hidden)):
386
+ id_vit_hidden[idx] = id_vit_hidden[idx].to(device, dtype=dtype)
387
+
388
+ id_cond_vit = torch.div(id_cond_vit, torch.norm(id_cond_vit, 2, 1, True))
389
+
390
+ # combine embeddings
391
+ id_cond = torch.cat([iface_embeds, id_cond_vit], dim=-1)
392
+ if noise == 0:
393
+ id_uncond = torch.zeros_like(id_cond)
394
+ else:
395
+ id_uncond = torch.rand_like(id_cond) * noise
396
+ id_vit_hidden_uncond = []
397
+ for idx in range(len(id_vit_hidden)):
398
+ if noise == 0:
399
+ id_vit_hidden_uncond.append(torch.zeros_like(id_vit_hidden[idx]))
400
+ else:
401
+ id_vit_hidden_uncond.append(torch.rand_like(id_vit_hidden[idx]) * noise)
402
+
403
+ cond.append(pulid_model.get_image_embeds(id_cond, id_vit_hidden))
404
+ uncond.append(pulid_model.get_image_embeds(id_uncond, id_vit_hidden_uncond))
405
+
406
+ # average embeddings
407
+ cond = torch.cat(cond).to(device, dtype=dtype)
408
+ uncond = torch.cat(uncond).to(device, dtype=dtype)
409
+ if cond.shape[0] > 1:
410
+ cond = torch.mean(cond, dim=0, keepdim=True)
411
+ uncond = torch.mean(uncond, dim=0, keepdim=True)
412
+
413
+ if num_zero > 0:
414
+ if noise == 0:
415
+ zero_tensor = torch.zeros((cond.size(0), num_zero, cond.size(-1)), dtype=dtype, device=device)
416
+ else:
417
+ zero_tensor = torch.rand((cond.size(0), num_zero, cond.size(-1)), dtype=dtype, device=device) * noise
418
+ cond = torch.cat([cond, zero_tensor], dim=1)
419
+ uncond = torch.cat([uncond, zero_tensor], dim=1)
420
+
421
+ sigma_start = work_model.get_model_object("model_sampling").percent_to_sigma(start_at)
422
+ sigma_end = work_model.get_model_object("model_sampling").percent_to_sigma(end_at)
423
+
424
+ patch_kwargs = {
425
+ "pulid": pulid_model,
426
+ "weight": weight,
427
+ "cond": cond,
428
+ "uncond": uncond,
429
+ "sigma_start": sigma_start,
430
+ "sigma_end": sigma_end,
431
+ "ortho": ortho,
432
+ "ortho_v2": ortho_v2,
433
+ "mask": attn_mask,
434
+ }
435
+
436
+ number = 0
437
+ for id in [4,5,7,8]: # id of input_blocks that have cross attention
438
+ block_indices = range(2) if id in [4, 5] else range(10) # transformer_depth
439
+ for index in block_indices:
440
+ patch_kwargs["module_key"] = str(number*2+1)
441
+ set_model_patch_replace(work_model, patch_kwargs, ("input", id, index))
442
+ number += 1
443
+ for id in range(6): # id of output_blocks that have cross attention
444
+ block_indices = range(2) if id in [3, 4, 5] else range(10) # transformer_depth
445
+ for index in block_indices:
446
+ patch_kwargs["module_key"] = str(number*2+1)
447
+ set_model_patch_replace(work_model, patch_kwargs, ("output", id, index))
448
+ number += 1
449
+ for index in range(10):
450
+ patch_kwargs["module_key"] = str(number*2+1)
451
+ set_model_patch_replace(work_model, patch_kwargs, ("middle", 0, index))
452
+ number += 1
453
+
454
+ return (work_model,)
455
+
456
+ class ApplyPulidAdvanced(ApplyPulid):
457
+ @classmethod
458
+ def INPUT_TYPES(s):
459
+ return {
460
+ "required": {
461
+ "model": ("MODEL", ),
462
+ "pulid": ("PULID", ),
463
+ "eva_clip": ("EVA_CLIP", ),
464
+ "face_analysis": ("FACEANALYSIS", ),
465
+ "image": ("IMAGE", ),
466
+ "weight": ("FLOAT", {"default": 1.0, "min": -1.0, "max": 5.0, "step": 0.05 }),
467
+ "projection": (["ortho_v2", "ortho", "none"],),
468
+ "fidelity": ("INT", {"default": 8, "min": 0, "max": 32, "step": 1 }),
469
+ "noise": ("FLOAT", {"default": 0.0, "min": -1.0, "max": 1.0, "step": 0.1 }),
470
+ "start_at": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
471
+ "end_at": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001 }),
472
+ },
473
+ "optional": {
474
+ "attn_mask": ("MASK", ),
475
+ },
476
+ }
477
+
478
+ NODE_CLASS_MAPPINGS = {
479
+ "PulidModelLoader": PulidModelLoader,
480
+ "PulidInsightFaceLoader": PulidInsightFaceLoader,
481
+ "PulidEvaClipLoader": PulidEvaClipLoader,
482
+ "ApplyPulid": ApplyPulid,
483
+ "ApplyPulidAdvanced": ApplyPulidAdvanced,
484
+ }
485
+
486
+ NODE_DISPLAY_NAME_MAPPINGS = {
487
+ "PulidModelLoader": "Load PuLID Model",
488
+ "PulidInsightFaceLoader": "Load InsightFace (PuLID)",
489
+ "PulidEvaClipLoader": "Load Eva Clip (PuLID)",
490
+ "ApplyPulid": "Apply PuLID",
491
+ "ApplyPulidAdvanced": "Apply PuLID Advanced",
492
+ }
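
In pulid.py above, the "style" method maps to the ortho branch of pulid_attention: it keeps only the part of the identity attention output (out_ip) that is orthogonal, along the token axis, to the base attention output (out). The "fidelity" method uses ortho_v2, which removes only a fraction of the parallel component, scaled by the averaged attention given to the first ID tokens. A self-contained sketch of the plain ortho projection (mirrors the branch above with illustrative shapes; not part of the uploaded file):

    import torch

    def ortho_project(out: torch.Tensor, out_ip: torch.Tensor, weight: float = 1.0) -> torch.Tensor:
        # out, out_ip: (batch, seq_len, dim); project out_ip onto out over the token axis
        projection = (out * out_ip).sum(dim=-2, keepdim=True) / (out * out).sum(dim=-2, keepdim=True) * out
        return weight * (out_ip - projection)   # component of out_ip orthogonal to out

    out, out_ip = torch.randn(2, 16, 8), torch.randn(2, 16, 8)
    delta = ortho_project(out, out_ip)
    print((delta * out).sum(dim=-2).abs().max())   # ~0: the added residual is orthogonal to out
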
PuLID_ComfyUI/pyproject.toml ADDED
@@ -0,0 +1,15 @@
1
+ [project]
2
+ name = "pulid_comfyui"
3
+ description = "PuLID ComfyUI native implementation."
4
+ version = "1.0.0"
5
+ license = { file = "LICENSE" }
6
+ dependencies = ["facexlib", "insightface", "onnxruntime", "onnxruntime-gpu", "ftfy", "timm"]
7
+
8
+ [project.urls]
9
+ Repository = "https://github.com/cubiq/PuLID_ComfyUI"
10
+ # Used by Comfy Registry https://comfyregistry.org
11
+
12
+ [tool.comfy]
13
+ PublisherId = "matteo"
14
+ DisplayName = "PuLID_ComfyUI"
15
+ Icon = ""
PuLID_ComfyUI/requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ facexlib
2
+ insightface
3
+ onnxruntime
4
+ onnxruntime-gpu
5
+ ftfy
6
+ timm
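
A quick way to confirm these requirements resolve in the active environment (a convenience sketch, not part of the repo; onnxruntime and onnxruntime-gpu both expose the same onnxruntime module):

    import importlib

    for module in ("facexlib", "insightface", "onnxruntime", "ftfy", "timm"):
        importlib.import_module(module)
        print("ok:", module)
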
example_node.py.example ADDED
@@ -0,0 +1,155 @@
1
+ class Example:
2
+ """
3
+ An example node
4
+
5
+ Class methods
6
+ -------------
7
+ INPUT_TYPES (dict):
8
+ Tell the main program the input parameters of the node.
9
+ IS_CHANGED:
10
+ Optional method to control when the node is re-executed.
11
+
12
+ Attributes
13
+ ----------
14
+ RETURN_TYPES (`tuple`):
15
+ The type of each element in the output tuple.
16
+ RETURN_NAMES (`tuple`):
17
+ Optional: The name of each output in the output tuple.
18
+ FUNCTION (`str`):
19
+ The name of the entry-point method. For example, if `FUNCTION = "execute"` then it will run Example().execute()
20
+ OUTPUT_NODE ([`bool`]):
21
+ If this node is an output node that outputs a result/image from the graph. The SaveImage node is an example.
22
+ The backend iterates on these output nodes and tries to execute all their parents if their parent graph is properly connected.
23
+ Assumed to be False if not present.
24
+ CATEGORY (`str`):
25
+ The category the node should appear in the UI.
26
+ DEPRECATED (`bool`):
27
+ Indicates whether the node is deprecated. Deprecated nodes are hidden by default in the UI, but remain
28
+ functional in existing workflows that use them.
29
+ EXPERIMENTAL (`bool`):
30
+ Indicates whether the node is experimental. Experimental nodes are marked as such in the UI and may be subject to
31
+ significant changes or removal in future versions. Use with caution in production workflows.
32
+ execute(s) -> tuple || None:
33
+ The entry point method. The name of this method must be the same as the value of property `FUNCTION`.
34
+ For example, if `FUNCTION = "execute"` then this method's name must be `execute`, if `FUNCTION = "foo"` then it must be `foo`.
35
+ """
36
+ def __init__(self):
37
+ pass
38
+
39
+ @classmethod
40
+ def INPUT_TYPES(s):
41
+ """
42
+ Return a dictionary which contains config for all input fields.
43
+ Some types (string): "MODEL", "VAE", "CLIP", "CONDITIONING", "LATENT", "IMAGE", "INT", "STRING", "FLOAT".
44
+ Input types "INT", "STRING" or "FLOAT" are special values for fields on the node.
45
+ The type can be a list for selection.
46
+
47
+ Returns: `dict`:
48
+ - Key input_fields_group (`string`): Can be either required, hidden or optional. A node class must have property `required`
49
+ - Value input_fields (`dict`): Contains input fields config:
50
+ * Key field_name (`string`): Name of a entry-point method's argument
51
+ * Value field_config (`tuple`):
52
+ + First value is a string indicate the type of field or a list for selection.
53
+ + Second value is a config for type "INT", "STRING" or "FLOAT".
54
+ """
55
+ return {
56
+ "required": {
57
+ "image": ("IMAGE",),
58
+ "int_field": ("INT", {
59
+ "default": 0,
60
+ "min": 0, #Minimum value
61
+ "max": 4096, #Maximum value
62
+ "step": 64, #Slider's step
63
+ "display": "number", # Cosmetic only: display as "number" or "slider"
64
+ "lazy": True # Will only be evaluated if check_lazy_status requires it
65
+ }),
66
+ "float_field": ("FLOAT", {
67
+ "default": 1.0,
68
+ "min": 0.0,
69
+ "max": 10.0,
70
+ "step": 0.01,
71
+ "round": 0.001, #The value representing the precision to round to, will be set to the step value by default. Can be set to False to disable rounding.
72
+ "display": "number",
73
+ "lazy": True
74
+ }),
75
+ "print_to_screen": (["enable", "disable"],),
76
+ "string_field": ("STRING", {
77
+ "multiline": False, #True if you want the field to look like the one on the ClipTextEncode node
78
+ "default": "Hello World!",
79
+ "lazy": True
80
+ }),
81
+ },
82
+ }
83
+
84
+ RETURN_TYPES = ("IMAGE",)
85
+ #RETURN_NAMES = ("image_output_name",)
86
+
87
+ FUNCTION = "test"
88
+
89
+ #OUTPUT_NODE = False
90
+
91
+ CATEGORY = "Example"
92
+
93
+ def check_lazy_status(self, image, string_field, int_field, float_field, print_to_screen):
94
+ """
95
+ Return a list of input names that need to be evaluated.
96
+
97
+ This function will be called if there are any lazy inputs which have not yet been
98
+ evaluated. As long as you return at least one field which has not yet been evaluated
99
+ (and more exist), this function will be called again once the value of the requested
100
+ field is available.
101
+
102
+ Any evaluated inputs will be passed as arguments to this function. Any unevaluated
103
+ inputs will have the value None.
104
+ """
105
+ if print_to_screen == "enable":
106
+ return ["int_field", "float_field", "string_field"]
107
+ else:
108
+ return []
109
+
110
+ def test(self, image, string_field, int_field, float_field, print_to_screen):
111
+ if print_to_screen == "enable":
112
+ print(f"""Your input contains:
113
+ string_field aka input text: {string_field}
114
+ int_field: {int_field}
115
+ float_field: {float_field}
116
+ """)
117
+ #do some processing on the image, in this example I just invert it
118
+ image = 1.0 - image
119
+ return (image,)
120
+
121
+ """
122
+ The node will always be re executed if any of the inputs change but
123
+ this method can be used to force the node to execute again even when the inputs don't change.
124
+ You can make this node return a number or a string. This value will be compared to the one returned the last time the node was
125
+ executed; if it is different, the node will be executed again.
126
+ This method is used in the core repo for the LoadImage node, where it returns the image hash as a string; if the image hash
127
+ changes between executions, the LoadImage node is executed again.
128
+ """
129
+ #@classmethod
130
+ #def IS_CHANGED(s, image, string_field, int_field, float_field, print_to_screen):
131
+ # return ""
132
+
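+ # An illustrative sketch (not part of the upstream example): one way IS_CHANGED could be
+ # filled in, hashing the widget values so the node re-executes whenever any of them change.
+ # It is left commented out, like the stub above, so enabling it stays opt-in.
+ #@classmethod
+ #def IS_CHANGED(s, image, string_field, int_field, float_field, print_to_screen):
+ #    import hashlib
+ #    # A different hash means the node is considered "changed" and will run again.
+ #    fingerprint = f"{string_field}|{int_field}|{float_field}|{print_to_screen}"
+ #    return hashlib.sha256(fingerprint.encode()).hexdigest()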
133
+ # Set the web directory; any .js file in that directory will be loaded by the frontend as a frontend extension
134
+ # WEB_DIRECTORY = "./somejs"
135
+
136
+
137
+ # Add custom API routes, using router
138
+ from aiohttp import web
139
+ from server import PromptServer
140
+
141
+ @PromptServer.instance.routes.get("/hello")
142
+ async def get_hello(request):
143
+ return web.json_response("hello")
144
+
145
+
146
+ # A dictionary that contains all nodes you want to export with their names
147
+ # NOTE: names should be globally unique
148
+ NODE_CLASS_MAPPINGS = {
149
+ "Example": Example
150
+ }
151
+
152
+ # A dictionary that contains the friendly/humanly readable titles for the nodes
153
+ NODE_DISPLAY_NAME_MAPPINGS = {
154
+ "Example": "Example Node"
155
+ }
rgthree-comfy/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Regis Gaughan, III (rgthree)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
rgthree-comfy/README.md ADDED
@@ -0,0 +1,411 @@
1
+ <h1 align="center">
2
+ rgthree-comfy
3
+ <br>
4
+ <sub><sup><i>Making ComfyUI more comfortable!</i></sup></sub>
5
+ <br>
6
+ </h1>
7
+ <p align="center">
8
+ <a href="#️-the-nodes">The Nodes</a> &nbsp; | &nbsp; <a href="#-improvements--features">Improvements & Features</a> &nbsp; | &nbsp; <a href="#-link-fixer">Link Fixer</a>
9
+ </p>
10
+ <hr>
11
+
12
+ A collection of nodes and improvements created while messing around with ComfyUI. I made them for myself to make my workflow cleaner, easier, and faster. You're welcome to try them out. But remember, I made them for my own use cases :)
13
+
14
+ ![Context Node](./docs/rgthree_advanced.png)
15
+
16
+ # Get Started
17
+
18
+ ## Install
19
+
20
+ 1. Install the great [ComfyUI](https://github.com/comfyanonymous/ComfyUI).
21
+ 2. Clone this repo into `custom_nodes`:
22
+ ```
23
+ cd ComfyUI/custom_nodes
24
+ git clone https://github.com/rgthree/rgthree-comfy.git
25
+ ```
26
+ 3. Start up ComfyUI.
27
+
28
+ ## Settings
29
+
30
+ You can configure certain aspects of rgthree-comfy. For instance, perhaps a future ComfyUI change breaks rgthree-comfy, or you already have another extension that does something similar and you want to turn it off for rgthree-comfy.
31
+
32
+ You can get to rgthree-settings by right-clicking on the empty part of the graph, and selecting `rgthree-comfy > Settings (rgthree-comfy)` or by clicking the `rgthree-comfy settings` in the ComfyUI settings dialog.
33
+
34
+ _(Note, settings are stored in an `rgthree_config.json` in the `rgthree-comfy` directory. There are other advanced settings that can only be configured there; you can copy default settings from `rgthree_config.json.default` to `rgthree_config.json` before modifying)_.
35
+
36
+ <br>
37
+
38
+ # ✴️ The Nodes
39
+
40
+ Note, you can right-click on a bunch of the rgthree-comfy nodes and select the `🛟 Node Help` menu item for in-app help, when available.
41
+
42
+ ## Seed
43
+ > An intuitive seed control node for ComfyUI that works very much like Automatic1111's seed control.
44
+ > <details>
45
+ > <summary>ℹ️ <i>See More Information</i></summary>
46
+ >
47
+ > - Set the seed value to "-1" to use a random seed every time
48
+ > - Set any other number in there to use as a static/fixed seed
49
+ > - Quick actions to randomize, or (re-)use the last queued seed.
50
+ > - Image metadata will store the seed value _(so dragging an image in will have the seed field already fixed to its seed)_.
51
+ > - _Secret Features_: You can manually set the seed value to "-2" or "-3" to increment or decrement the last seed value. If there is no last seed value, it will use a random one first.
52
+ >
53
+ > ![Router Node](./docs/rgthree_seed.png)
54
+ > </details>
55
+
56
+
57
+ ## Reroute
58
+ > Keep your workflow neat with this much improved Reroute node with, like, actual rerouting with multiple directions and sizes.
59
+ > <details>
60
+ > <summary>ℹ️ <i>More Information</i></summary>
61
+ >
62
+ > - Use the right-click context menu to change the width, height and connection layout
63
+ > - Also toggle resizability (min size is 40x43 if resizing though), and title/type display.
64
+ >
65
+ > ![Router Node](./docs/rgthree_router.png)
66
+ > </details>
67
+
68
+ ## Bookmark (🔖)
69
+ > Place the bookmark node anywhere on screen to quickly navigate to that spot with a shortcut key.
70
+ > <details>
71
+ > <summary>ℹ️ <i>See More Information</i></summary>
72
+ >
73
+ > - Define the `shortcut_key` to press to go right to that bookmark node, anchored in the top left.
74
+ > - You can define the zoom level as well!
75
+ > - Pro tip: `shortcut_key` can be multiple keys. For instance "alt + shift + !" would require
76
+ > pressing the alt key, the shift key, and the "!" (as in the "1" key, but with shift pressed)
77
+ > in order to trigger.
78
+ > </details>
79
+
80
+
81
+ ## Context / Context Big
82
+ > Pass along general flow properties, and merge in new data. Similar to some other node suites' "pipes" but with easier merging, and more easily interoperable with standard nodes by both combining and exploding all in a single node.
83
+ > <details>
84
+ > <summary>ℹ️ <i>More Information</i></summary>
85
+ >
86
+ > - Context and Context Big are backwards compatible with each other. That is, an input connected to a Context Big will be passed through the CONTEXT outputs through normal Context nodes and available as an output on either (or, Context Big if the output is only on that node, like "steps").
87
+ > - Pro Tip: When dragging a Context output over another node, hold down "ctrl" and release to automatically connect the other Context outputs to the hovered node.
88
+ > - Pro Tip: You can change between Context and Context Big nodes from the menu.
89
+ >
90
+ > ![Context Node](./docs/rgthree_context.png)
91
+ > </details>
92
+
93
+ ## Image Comparer
94
+ > The Image Comparer node compares two images on top of each other.
95
+ > <details>
96
+ > <summary>ℹ️ <i>More Information</i></summary>
97
+ >
98
+ > - **Note:** The right-click menu may show image options (Open Image, Save Image, etc.) which will correspond to the first image (image_a) if clicked on the left-half of the node, or the second image if on the right half of the node.
99
+ > - **Inputs:**
100
+ > - `image_a` _Required._ The first image to use to compare. If image_b is not supplied and image_a is a batch, the comparer will use the first two images of image_a.
101
+ > - `image_b` _Optional._ The second image to use to compare. Optional only if image_a is a batch with two images.
102
+ > - **Properties:** You can change the following properties (by right-clicking on the node, and select "Properties" or "Properties Panel" from the menu):
103
+ > - `comparer_mode` - Choose between "Slide" and "Click". Defaults to "Slide".
104
+
105
+
106
+ ## Image Inset Crop
107
+ > A node that lets you crop an input image by either pixel or percentage values.
108
+
109
+
110
+ ## Display Any
111
+ > Displays most any piece of text data from the backend _after execution_.
112
+
113
+ ## Power Lora Loader
114
+ > A super-simple Lora Loader node that can load multiple Loras at once, and quickly toggle each, all in an ultra-condensed node.
115
+ > <details>
116
+ > <summary>ℹ️ <i>More Information</i></summary>
117
+ >
118
+ > - Add as many Loras as you would like by clicking the "+ Add Lora" button. There's no real limit!
119
+ > - Right-click on a Lora widget for special options to move the lora up or down
120
+ > _(no effect on the image, just presentation)_, toggle it on/off, or delete the row altogether.
121
+ > - From the properties, change `Show Strengths` to choose between showing a single, simple
122
+ > strength value (which will be used for both model and clip), or a more advanced view with
123
+ > both model and clip strengths being modifiable.
124
+ > </details>
125
+
126
+
127
+ ## ~~Lora Loader Stack~~
128
+ > _**Deprecated.** Use the `Power Lora Loader` instead._
129
+ >
130
+ > A simplified Lora Loader stack. Much like other suites, but more interoperable with standard inputs/outputs.
131
+
132
+
133
+ ## Power Prompt
134
+ > Power up your prompt with dropdowns for adding your embeddings, loras, and even saved prompt snippets.
135
+ > <details>
136
+ > <summary>ℹ️ <i>More Information</i></summary>
137
+ >
138
+ > - At the core, you can use Power Prompt almost as a String Primitive node with additional features of dropdowns for choosing your embeddings, and even loras, with no further processing. This will output just the raw `TEXT` to another node for any lora processing, CLIP Encoding, etc.
139
+ > - Connect a `CLIP` to the input to encode the text, with both the `CLIP` and `CONDITIONING` output right from the node.
140
+ > - Connect a `MODEL` to the input to parse and load any `<lora:...>` tags in the text automatically, without
141
+ > needing separate Lora Loaders
142
+ > </details>
143
+
144
+ ## Power Prompt - Simple
145
+ > Same as Power Prompt above, but without LORA support; made for a slightly cleaner negative prompt _(since negative prompts do not support loras)_.
146
+
147
+ ## SDXL Power Prompt - Positive
148
+ > The SDXL sibling to the Power Prompt above. It contains text_g and text_l as separate text inputs, as well as a couple more input slots necessary to ensure proper clip encoding.
149
+
150
+ ## SDXL Power Prompt - Simple
151
+ > Like the non-SDXL `Power Prompt - Simple` node, this one is essentially the same as the SDXL Power Prompt but without lora support for either non-lora positive prompts or SDXL negative prompts _(since negative prompts do not support loras)_.
152
+
153
+ ## SDXL Config
154
+ > Just some configuration fields for SDXL prompting. Honestly, it could be used for non-SDXL too.
155
+
156
+ ## Context Switch / Context Switch Big
157
+ > A powerful node to branch your workflow. Works by choosing the first Context input that is not null/empty.
158
+ > <details>
159
+ > <summary>ℹ️ <i>More Information</i></summary>
160
+ >
161
+ > - Pass in several context nodes and the Context Switch will automatically choose the first non-null context to continue onward with.
162
+ > - Wondering how to toggle contexts to null? Use in conjunction with the **Fast Muter** or **Fast Groups Muter**
163
+ >
164
+ > </details>
165
+
166
+ ## Any Switch
167
+ > A powerful node, similar to the Context Switch above, that chooses the first input that is not null/empty.
168
+ > <details>
169
+ > <summary>ℹ️ <i>More Information</i></summary>
170
+ >
171
+ > - Pass in several inputs of the same type and the Any Switch will automatically choose the first non-null value to continue onward with.
172
+ > - Wondering how to toggle inputs to null? Use in conjunction with the **Fast Muter** or **Fast Groups Muter**
173
+ >
174
+ > </details>
175
+
176
+
177
+ ## Fast Groups Muter
178
+ > The Fast Groups Muter is an input-less node that automatically collects all groups in your current workflow and allows you to quickly mute and unmute all nodes within the group.
179
+ > <details>
180
+ > <summary>ℹ️ <i>More Information</i></summary>
181
+ >
182
+ > - Groups will automatically be shown, though you can filter, sort and more from the **node Properties** _(by right-clicking on the node, and select "Properties" or "Properties Panel" from the menu)_. Properties include:
183
+ > - `matchColors` - Only add groups that match the provided colors. Can be ComfyUI colors (red, pale_blue) or hex codes (#a4d399). Multiple can be added, comma delimited.
184
+ > - `matchTitle` - Filter the list of toggles by title match (string match, or regular expression).
185
+ > - `showNav` - Add / remove a quick navigation arrow to take you to the group. (default: true)
186
+ > - `sort` - Sort the toggles' order by "alphanumeric", graph "position", or "custom alphabet". (default: "position")
187
+ > - `customSortAlphabet` - When the sort property is "custom alphabet" you can define the alphabet to use here, which will match the beginning of each group name and sort against it. If group titles do not match any custom alphabet entry, then they will be put after groups that do, ordered alphanumerically.
188
+ >
189
+ > This can be a list of single characters, like "zyxw..." or comma delimited strings for more control, like "sdxl,pro,sd,n,p".
190
+ >
191
+ > Note, when two group titles match the same custom alphabet entry, the normal alphanumeric alphabet breaks the tie. For instance, a custom alphabet of "e,s,d" will order group names like "SDXL, SEGS, Detailer" even though the custom alphabet has an "e" before "d" (where one may expect "SE" to be before "SD").
192
+ >
193
+ > To have "SEGS" appear before "SDXL" you can use longer strings. For instance, the custom alphabet value of "se,s,f" would work here.
194
+ > - `toggleRestriction` - Optionally, attempt to restrict the number of widgets that can be enabled to a maximum of one, or always one.
195
+ >
196
+ > _Note: If using "max one" or "always one" then this is only enforced when clicking a toggle on this node; if nodes within groups are changed outside of the initial toggle click, then these restrictions will not be enforced, and could result in a state where more than one toggle is enabled. This could also happen if nodes are overlapped with multiple groups._
197
+ > </details>
198
+
199
+ ## Fast Groups Bypasser
200
+ > _Same as **Fast Groups Muter** above, but sets the connected nodes to "Bypass" instead of "Mute"_
201
+
202
+
203
+ ## Fast Muter
204
+ > A powerful 'control panel' node to quickly toggle connected nodes, allowing them to be muted or enabled.
205
+ > <details>
206
+ > <summary>ℹ️ <i>More Information</i></summary>
207
+ >
208
+ > - Collects all connected nodes, allowing a single spot to act as a "dashboard" to quickly enable and disable nodes. Two distinct nodes exist; one for "Muting" connected nodes, and one for "Bypassing" connected nodes.
209
+ > </details>
210
+
211
+
212
+ ## Fast Bypasser
213
+ > Same as Fast Muter but sets the connected nodes to "Bypass"
214
+
215
+ ## Fast Actions Button
216
+ > Oh boy, this node allows you to semi-automate connected nodes and/or ComfyUI.
217
+ > <details>
218
+ > <summary>ℹ️ <i>More Information</i></summary>
219
+ >
220
+ > - Connect nodes and, at the least, mute, bypass or enable them when the button is pressed.
221
+ > - Certain nodes expose additional actions. For instance, with the `Seed` node you can set `Randomize Each Time` or `Use Last Queued Seed` when the button is pressed.
222
+ > - Also, from the node properties, set a shortcut key to toggle the button actions, without needing a click!
223
+ > </details>
224
+
225
+
226
+ ## Node Collector
227
+ > Used to clean up noodles, this will accept any number of input nodes and pass them along to another node.
228
+ >
229
+ > ⚠️ *Currently, this should really only be connected to **Fast Muter**, **Fast Bypasser**, or **Mute / Bypass Relay**.*
230
+
231
+
232
+ ## Mute / Bypass Repeater
233
+ > A powerful node that will dispatch its Mute/Bypass/Active mode to all connected input nodes or, if in a group w/o any connected inputs, will dispatch its Mute/Bypass/Active mode to all nodes in that group.
234
+ > <details>
235
+ > <summary>ℹ️ <i>More Information</i></summary>
236
+ >
237
+ > - 💡 Pro Tip #1: Connect this node's output to a **Fast Muter** or **Fast Bypasser** to have a single toggle there that can mute/bypass/enable many nodes with one click.
238
+ >
239
+ > - 💡 Pro Tip #2: Connect a **Mute / Bypass Relay** node to this node's inputs to have the relay automatically dispatch a mute/bypass/enable change to the repeater.
240
+ > </details>
241
+
242
+
243
+ ## Mute / Bypass Relay
244
+ > An advanced node that, when working with a **Mute / Bypass Repeater**, will relay its input nodes'
245
+ > modes (Mute, Bypass, or Active) to a connected repeater (which would then repeat that mode change
246
+ > to all of its inputs).
247
+ > <details>
248
+ > <summary>ℹ️ <i>More Information</i></summary>
249
+ >
250
+ > - When all connected input nodes are muted, the relay will set a connected repeater to mute (by
251
+ > default).
252
+ > - When all connected input nodes are bypassed, the relay will set a connected repeater to
253
+ > bypass (by default).
254
+ > - When _any_ connected input nodes are active, the relay will set a connected repeater to
255
+ > active (by default).
256
+ > - **Note:** If no inputs are connected, the relay will set a connected repeater to its mode
257
+ > _when its own mode is changed_. **Note**, if any inputs are connected, then the above bullets
258
+ > will occur and the Relay's mode does not matter.
259
+ > - **Pro Tip:** You can change which signals get sent on the above in the `Properties`.
260
+ > For instance, you could configure an inverse relay which will send a MUTE when any of its
261
+ > inputs are active (instead of sending an ACTIVE signal), and send an ACTIVE signal when all
262
+ > of its inputs are muted (instead of sending a MUTE signal), etc.
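+ >
+ > A rough sketch of the default relay logic described above _(conceptual only; this is not the extension's actual code, and the mode names are placeholders)_:
+ >
+ > ```
+ > def relay_signal(input_modes):
+ >     """input_modes: the Mute/Bypass/Active mode of each connected input node."""
+ >     if not input_modes:
+ >         return None  # no inputs connected: the relay forwards its own mode when it changes
+ >     if any(mode == "ACTIVE" for mode in input_modes):
+ >         return "ACTIVE"
+ >     if all(mode == "MUTE" for mode in input_modes):
+ >         return "MUTE"
+ >     if all(mode == "BYPASS" for mode in input_modes):
+ >         return "BYPASS"
+ >     return None  # mixed mute/bypass with nothing active: behavior not covered above
+ > ```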
263
+ > </details>
264
+
265
+
266
+ ## Random Unmuter
267
+ > An advanced node used to unmute one of its inputs randomly when the graph is queued (and, immediately mute it back).
268
+ > <details>
269
+ > <summary>ℹ️ <i>More Information</i></summary>
270
+ >
271
+ > - **Note:** All input nodes MUST be muted to start; if not, this node will not randomly unmute one. (This is powerful, as the generated image can be dragged in and the chosen input will already be unmuted and work w/o any further action.)
272
+ > - **Tip:** Connect a Repeater's output to this node's input and place that Repeater on a group without any other inputs, and it will mute/unmute the entire group.
273
+ > </details>
274
+
275
+
276
+ ## Label
277
+ > A purely visual node, this allows you to add a floating label to your workflow.
278
+ > <details>
279
+ > <summary>ℹ️ <i>More Information</i></summary>
280
+ >
281
+ > - The text shown is the "Title" of the node and you can adjust the font size, font family,
282
+ > font color, text alignment as well as a background color, padding, and background border
283
+ > radius from the node's properties. You can double-click the node to open the properties
284
+ > panel.
285
+ > - **Pro Tip #1:** You can add multiline text from the properties panel _(because ComfyUI lets
286
+ > you shift + enter there, only)._
287
+ > - **Pro Tip #2:** You can use ComfyUI's native "pin" option in the right-click menu to make the
288
+ > label stick to the workflow and clicks to "go through". You can right-click at any time to
289
+ > unpin.
290
+ > - **Pro Tip #3:** Color values are hexadecimal strings, like "#FFFFFF" for white, or "#660000"
291
+ > for dark red. You can supply a 7th & 8th value (or 5th if using shorthand) to create a
292
+ > translucent color. For instance, "#FFFFFF88" is semi-transparent white.
293
+ > </details>
294
+
295
+
296
+ # Advanced Techniques
297
+
298
+ ## First, a word on muting
299
+
300
+ A lot of the power of these nodes comes from *Muting*. Muting is the basis of correctly implementing multiple paths for a workflow utilizing the Context Switch node.
301
+
302
+ While other extensions may provide switches, they often get it wrong, causing your workflow to do more work than is needed. While other switches may have a selector to choose which input to pass along, they don't stop the execution of the other inputs, which results in wasted work. Instead, Context Switch works by choosing the first non-empty context to pass along; correctly muting is one way to make a previous node's output empty, and it causes no extra work when set up correctly.
303
+
304
+ ### To understand muting is to understand the graph flow
305
+
306
+ Muting, and therefore using Switches, can often confuse people at first because it _feels_ like muting a node, or using a switch, should be able to stop or direct the _forward_ flow of the graph. However, this is not the case and, in fact, the graph actually starts working backwards.
307
+
308
+ If you have a workflow that has a path like `... > Context > KSampler > VAE Decode > Save Image` it may initially _feel_ like you should be able to mute that first Context node and the graph would stop there when moving forward and skip the rest of that workflow.
309
+
310
+ But you'll quickly find that will cause an error, because the graph doesn't actually move forward. When a workflow is processed, it _first moves backwards_, starting at each "Output Node" (Preview Image, Save Image, even "Display String", etc.) and then walking backwards along all possible paths to get there.
311
+
312
+ So, with that `... > Context > KSampler > VAE Decode > Save Image` example from above, we actually want to mute the `Save Image` node to stop this path. Once we do, since the output node is gone, none of these nodes will be run.
313
+
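+ As a rough mental model _(this is not ComfyUI's actual code)_, you can think of that backwards walk like this, where `graph` maps each node id to the node ids it takes inputs from:
+
+ ```
+ def nodes_that_will_run(graph, output_nodes, muted):
+     """Walk backwards from every non-muted output node, collecting its ancestors."""
+     will_run = set()
+
+     def walk(node_id):
+         if node_id in will_run:
+             return
+         will_run.add(node_id)
+         for input_id in graph.get(node_id, []):
+             walk(input_id)
+
+     for output_id in output_nodes:
+         if output_id not in muted:  # a muted Save Image never starts a walk
+             walk(output_id)
+     return will_run
+ ```
+
+ Muting the `Save Image` output removes the starting point of that walk, so nothing upstream of it is ever visited.
+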
314
+ Let's take a look at an example.
315
+
316
+ ### A powerful combination: Using Context, Context Switch, & Fast Muter
317
+
318
+ ![Context Node](./docs/rgthree_advanced.png)
319
+
320
+ 1. Using the **Context Switch** (aqua colored in screenshot) feed context inputs in order of preference. In the workflow above, the `Upscale Out` context is first so, if that one is enabled, it will be chosen for the output. If not, the second input slot which comes from the context rerouted from above (before the Upscaler booth) will be chosen.
321
+
322
+ - Notice the `Upscale Preview` is _after_ the `Upscale Out` context node, using the image from it instead of the image from the upscale `VAE Decoder`. This is on purpose so, when we disable the `Upscale Out` context, none of the Upscaler nodes will run, saving precious GPU cycles. If we had the preview hooked up directly to the `VAE Decoder` the upscaler would always run to generate the preview, even if we had the `Upscale Out` context node disabled.
323
+
324
+ 2. We can now disable the `Upscale Out` context node by _muting_ it. Highlighting it and pressing `ctrl + m` will work. By doing so, its output will be None, and it will not pass anything on to the further nodes. In the diagram you can see the `Upscale Preview` is red, but that's OK; there are no actual errors to stop execution.
325
+
326
+ 3. Now, let's hook it up to the `Fast Muter` node. The `Fast Muter` node works as a dashboard by adding quick toggles for any connected node (ignoring reroutes). In the diagram, we have both the `Upscaler Out` context node, and the `Save File` context node hooked up. So, we can quickly enable and disable those.
327
+
328
+ - The workflow seen here would be a common one where we can generate a handful of base previews cheaply with a random seed, and then choose one to upscale and save to disk.
329
+
330
+ 4. Lastly, and optionally, you can see the `Node Collector`. Use it to clean up noodles if you want and connect it to the muter. You can connect anything to it, but doing so may break your workflow's execution.
331
+
332
+ <br>
333
+
334
+ # ⚡ Improvements & Features
335
+
336
+ rgthree-comfy adds several improvements, features, and optimizations to ComfyUI that are not directly tied to nodes.
337
+
338
+ ## Progress Bar
339
+ > A minimal progress bar that runs along the top of the app window and shows the queue size, the current progress of a prompt execution (within the same window), and the progress of multi-step nodes as well.
340
+ >
341
+ > <i>You can remove/enable from rgthree-comfy settings, as well as configure the height/size.</i>
342
+
343
+
344
+ ## ComfyUI Recursive Optimization
345
+ > An optimization to ComfyUI's recursive execution. Because rgthree-comfy nodes make it easy to build larger, more complex workflows, I (and others) started to hit a wall of poor execution times.
346
+ > <details>
347
+ > <summary>ℹ️ <i>More Information</i></summary>
348
+ >
349
+ > - Until [ComfyUI/issues/1502](https://github.com/comfyanonymous/ComfyUI/issues/1502) is resolved and/or [ComfyUI/pull/1503](https://github.com/comfyanonymous/ComfyUI/pull/1503) is pulled in, know that you're benefiting from hundreds of millions of saved cycles each run.
350
+ >
351
+ > - Specifically, for a rather complex test workflow, the patch reduces iterations of `recursive_will_execute` from 113,292,566 to just 135 (and 116.32 seconds to 69.84 seconds on my machine) on a fresh queue, and reduces recursive calls of `recursive_output_delete_if_changed` from 250,496,808 to 142 (and 158.13 seconds to 0.0 seconds on my machine).
352
+ >
353
+ > - ⚠️ *However,* there is a chance ComfyUI changes something in/around the code I patched which could break. If that's the case, you should disable the optimization from rgthree-comfy settings.
354
+ >
355
+ > </details>
356
+
357
+
358
+ ## "Queue Selected Output Nodes" in right-click menu
359
+ > Sometimes you want to just queue one or two paths to specific output node(s) without executing the entire workflow. Well, now you can do just that by right-clicking on an output node and selecting `Queue Selected Output Nodes (rgthree)`.
360
+ >
361
+ > <details>
362
+ > <summary>ℹ️ <i>More Information</i></summary>
363
+ >
364
+ > - Select the _output_ nodes you want to execute.
365
+ >
366
+ > - Note: Only output nodes are captured and traversed, not all selected nodes. So if you select an output AND a node from a different path, only the path connected to the output will be executed and not non-output nodes, even if they were selected.
367
+ >
368
+ > - Note: The whole workflow is serialized, and then we trim what we don't want for the backend. So things like all seed random/increment/decrement will run even if that node isn't being sent in the end, etc.
369
+ >
370
+ > </details>
371
+
372
+
373
+ ## Auto-Nest Subdirectories in long Combos
374
+ > _(Off by default while experimenting, turn on in rgthree-comfy settings)_.
375
+ >
376
+ > Automatically detect top-level subdirectories in long combo lists (like Load Checkpoint) and break them out into subdirectories.
377
+
378
+
379
+ ## Quick Mute/Bypass Toggles in Group Headers
380
+ > _(Off by default while experimenting, turn on in rgthree-comfy settings)_.
381
+ >
382
+ > Adds mute and/or bypass toggle icons in the top-right of Group Headers for one-click toggling of groups you may be currently looking at.
383
+
384
+
385
+ ## Import Individual Node Widgets (Drag & Drop)
386
+ > _(Off by default while experimenting, turn on in rgthree-comfy settings)_.
387
+ >
388
+ > Allows dragging and dropping an image/JSON workflow from a previous generation and overriding the same node's widgets
389
+ > (that match with the same id & type). This is useful if you have several generations using the same general workflow
390
+ > and would like to import just some data, like a previous generation's seed, or prompt, etc.
391
+
392
+
393
+
394
+ ## "Copy Image" in right-click menu
395
+ > Right-clicking on a node that has an image will show a "Copy Image" context-menu item, which allows you to copy the image right to your clipboard.
396
+ >
397
+ > <i>🎓 I believe this has graduated, with ComfyUI recently adding this setting too. You won't get two menu items; my code checks that there isn't already a "Copy Image" item there before adding it.</i>
398
+
399
+
400
+ ## Other/Smaller Fixes
401
+ - Fixed the width of ultra-wide node chooser on double click.
402
+ - Fixed z-indexes for textareas that would overlap above other elements, like Properties Panel, or @pythongosssss's image viewer.
403
+ - Check for bad links when loading a workflow and log to console, by default. _(See Link Fixer below)._
404
+
405
+ <br>
406
+
407
+ # 📄 Link Fixer
408
+
409
+ If your workflows sometimes have missing connections, or even errors on load, start up ComfyUI and go to http://127.0.0.1:8188/rgthree/link_fixer which will allow you to drop in an image or workflow json file and check for and fix any bad links.
410
+
411
+ You can also enable a link fixer check in the rgthree-comfy settings to give you an alert if you load a workflow with bad linking data to start.
rgthree-comfy/__build__.py ADDED
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import subprocess
4
+ import os
5
+ from shutil import rmtree, copytree, ignore_patterns
6
+ from glob import glob
7
+ import time
8
+ import re
9
+ import argparse
10
+
11
+ from py.log import COLORS
12
+ from py.config import RGTHREE_CONFIG
13
+
14
+ start = time.time()
15
+
16
+ parser = argparse.ArgumentParser()
17
+ parser.add_argument("-t", "--with-tests", default=False, action="store_true")
18
+ parser.add_argument("-f", "--fix", default=False, action="store_true")
19
+ args = parser.parse_args()
20
+
21
+ THIS_DIR = os.path.dirname(os.path.abspath(__file__))
22
+ DIR_SRC_WEB = os.path.abspath(f'{THIS_DIR}/src_web/')
23
+ DIR_WEB = os.path.abspath(f'{THIS_DIR}/web/')
24
+ DIR_WEB_COMFYUI = os.path.abspath(f'{DIR_WEB}/comfyui/')
25
+
26
+
27
+ def log_step(msg=None, status=None):
28
+ """ Logs a step keeping track of timing and initial msg. """
29
+ global step_msg # pylint: disable=W0601
30
+ global step_start # pylint: disable=W0601
31
+ global step_warns # pylint: disable=W0601
32
+ if msg:
33
+ tag = f'{COLORS["YELLOW"]}[ Notice ]' if status == 'Notice' else f'{COLORS["RESET"]}[Starting]'
34
+ step_msg = f'▻ {tag}{COLORS["RESET"]} {msg}...'
35
+ step_start = time.time()
36
+ step_warns = []
37
+ print(step_msg, end="\r")
38
+ elif status:
39
+ if status != 'Error':
40
+ status = "Warn" if len(step_warns) > 0 else status
41
+ step_time = round(time.time() - step_start, 3)
42
+ if status == 'Error':
43
+ status_msg = f'{COLORS["RED"]}⤫ {status}{COLORS["RESET"]}'
44
+ elif status == 'Warn':
45
+ status_msg = f'{COLORS["YELLOW"]}! {status}{COLORS["RESET"]}'
46
+ else:
47
+ status_msg = f'{COLORS["BRIGHT_GREEN"]}🗸 {status}{COLORS["RESET"]}'
48
+ print(f'{step_msg.ljust(64, ".")} {status_msg} ({step_time}s)')
49
+ for warning in step_warns:
50
+ print(warning)
51
+
52
+
53
+ if args.fix:
54
+ tss = glob(os.path.join(DIR_SRC_WEB, "**", "*.ts"), recursive=True)
55
+ log_step(msg=f'Fixing {len(tss)} ts files')
56
+ for ts in tss:
57
+ with open(ts, 'r', encoding="utf-8") as f:
58
+ content = f.read()
59
+ # (\s*from\s*['"](?!.*[.]js['"]).*?)(['"];) in vscode.
60
+ content, n = re.subn(r'(\s*from [\'"](?!.*[.]js[\'"]).*?)([\'"];)', '\\1.js\\2', content)
61
+ if n > 0:
62
+ filename = os.path.basename(ts)
63
+ step_warns.append(
64
+ f' - {filename} has {n} import{"s" if n > 1 else ""} that do not end in ".js"')
65
+ with open(ts, 'w', encoding="utf-8") as f:
66
+ f.write(content)
67
+ log_step(status="Done")
68
+
69
+ log_step(msg='Copying web directory')
70
+ rmtree(DIR_WEB)
71
+ copytree(DIR_SRC_WEB, DIR_WEB, ignore=ignore_patterns("typings*", "*.ts", "*.scss"))
72
+ log_step(status="Done")
73
+
74
+ ts_version_result = subprocess.run(["node", "./node_modules/typescript/bin/tsc", "-v"],
75
+ capture_output=True,
76
+ text=True,
77
+ check=True)
78
+ ts_version = re.sub(r'^.*Version\s*([\d\.]+).*', 'v\\1', ts_version_result.stdout, flags=re.DOTALL)
79
+
80
+ log_step(msg=f'TypeScript ({ts_version})')
81
+ checked = subprocess.run(["node", "./node_modules/typescript/bin/tsc"], check=True)
82
+ log_step(status="Done")
83
+
84
+ if args.with_tests:
85
+ log_step(msg='Removing directories (KEEPING TESTING)', status="Notice")
86
+ else:
87
+ log_step(msg='Removing unneeded directories')
88
+ test_path = os.path.join(DIR_WEB, 'comfyui', 'tests')
89
+ if os.path.exists(test_path):
90
+ rmtree(test_path)
91
+ rmtree(os.path.join(DIR_WEB, 'comfyui', 'testing'))
92
+ # Always remove the dummy scripts_comfy directory
93
+ rmtree(os.path.join(DIR_WEB, 'scripts_comfy'))
94
+ log_step(status="Done")
95
+
96
+ scsss = glob(os.path.join(DIR_SRC_WEB, "**", "*.scss"), recursive=True)
97
+ log_step(msg=f'SASS for {len(scsss)} files')
98
+ scsss = [i.replace(THIS_DIR, '.') for i in scsss]
99
+ cmds = ["node", "./node_modules/sass/sass"]
100
+ for scss in scsss:
101
+ out = scss.replace('src_web', 'web').replace('.scss', '.css')
102
+ cmds.append(f'{scss}:{out}')
103
+ cmds.append('--no-source-map')
104
+ checked = subprocess.run(cmds, check=True)
105
+ log_step(status="Done")
106
+
107
+ # Handle the common directories. Because ComfyUI loads under /extensions/rgthree-comfy we can't
108
+ # easily share sources outside of the `DIR_WEB_COMFYUI` _and_ allow typescript to resolve them in
109
+ # src view, so we set the path in the tsconfig to map an import of "rgthree/common" to the
110
+ # "src_web/common" directory, but then need to rewrite the comfyui JS files to load from
111
+ # "../../rgthree/common" (which we map correctly in rgthree_server.py).
112
+ log_step(msg='Cleaning Imports')
113
+ js_files = glob(os.path.join(DIR_WEB, '**', '*.js'), recursive=True)
114
+ for file in js_files:
115
+ rel_path = file.replace(f'{DIR_WEB}/', "")
116
+ with open(file, 'r', encoding="utf-8") as f:
117
+ filedata = f.read()
118
+ num = rel_path.count(os.sep)
119
+ if rel_path.startswith('comfyui'):
120
+ filedata = re.sub(r'(from\s+["\'])rgthree/', f'\\1{"../" * (num + 1)}rgthree/', filedata)
121
+ filedata = re.sub(r'(from\s+["\'])scripts/', f'\\1{"../" * (num + 1)}scripts/', filedata)
122
+ else:
123
+ filedata = re.sub(r'(from\s+["\'])rgthree/', f'\\1{"../" * num}', filedata)
124
+ filedata = re.sub(r'(from\s+["\'])scripts/', f'\\1{"../" * (num + 1)}scripts/', filedata)
125
+ filedata, n = re.subn(r'(\s*from [\'"](?!.*[.]js[\'"]).*?)([\'"];)', '\\1.js\\2', filedata)
126
+ if n > 0:
127
+ filename = os.path.basename(file)
128
+ step_warns.append(
129
+ f' - {filename} has {n} import{"s" if n > 1 else ""} that do not end in ".js"')
130
+ with open(file, 'w', encoding="utf-8") as f:
131
+ f.write(filedata)
132
+ log_step(status="Done")
133
+
134
+ print(f'Finished all in {round(time.time() - start, 3)}s')
rgthree-comfy/__init__.py ADDED
@@ -0,0 +1,321 @@
1
+ """
2
+ @author: rgthree
3
+ @title: Comfy Nodes
4
+ @nickname: rgthree
5
+ @description: A bunch of nodes I created that I also find useful.
6
+ """
7
+
8
+ from glob import glob
9
+ import json
10
+ import os
11
+ import shutil
12
+ import re
13
+ import random
14
+
15
+ import execution
16
+
17
+ from .py.log import log
18
+ from .py.config import get_config_value
19
+ from .py.rgthree_server import *
20
+
21
+ from .py.context import RgthreeContext
22
+ from .py.context_switch import RgthreeContextSwitch
23
+ from .py.context_switch_big import RgthreeContextSwitchBig
24
+ from .py.display_any import RgthreeDisplayAny, RgthreeDisplayInt
25
+ from .py.lora_stack import RgthreeLoraLoaderStack
26
+ from .py.seed import RgthreeSeed
27
+ from .py.sdxl_empty_latent_image import RgthreeSDXLEmptyLatentImage
28
+ from .py.power_prompt import RgthreePowerPrompt
29
+ from .py.power_prompt_simple import RgthreePowerPromptSimple
30
+ from .py.image_inset_crop import RgthreeImageInsetCrop
31
+ from .py.context_big import RgthreeBigContext
32
+ from .py.dynamic_context import RgthreeDynamicContext
33
+ from .py.dynamic_context_switch import RgthreeDynamicContextSwitch
34
+ from .py.ksampler_config import RgthreeKSamplerConfig
35
+ from .py.sdxl_power_prompt_postive import RgthreeSDXLPowerPromptPositive
36
+ from .py.sdxl_power_prompt_simple import RgthreeSDXLPowerPromptSimple
37
+ from .py.any_switch import RgthreeAnySwitch
38
+ from .py.context_merge import RgthreeContextMerge
39
+ from .py.context_merge_big import RgthreeContextMergeBig
40
+ from .py.image_comparer import RgthreeImageComparer
41
+ from .py.power_lora_loader import RgthreePowerLoraLoader
42
+
43
+ NODE_CLASS_MAPPINGS = {
44
+ RgthreeBigContext.NAME: RgthreeBigContext,
45
+ RgthreeContext.NAME: RgthreeContext,
46
+ RgthreeContextSwitch.NAME: RgthreeContextSwitch,
47
+ RgthreeContextSwitchBig.NAME: RgthreeContextSwitchBig,
48
+ RgthreeContextMerge.NAME: RgthreeContextMerge,
49
+ RgthreeContextMergeBig.NAME: RgthreeContextMergeBig,
50
+ RgthreeDisplayInt.NAME: RgthreeDisplayInt,
51
+ RgthreeDisplayAny.NAME: RgthreeDisplayAny,
52
+ RgthreeLoraLoaderStack.NAME: RgthreeLoraLoaderStack,
53
+ RgthreeSeed.NAME: RgthreeSeed,
54
+ RgthreeImageInsetCrop.NAME: RgthreeImageInsetCrop,
55
+ RgthreePowerPrompt.NAME: RgthreePowerPrompt,
56
+ RgthreePowerPromptSimple.NAME: RgthreePowerPromptSimple,
57
+ RgthreeKSamplerConfig.NAME: RgthreeKSamplerConfig,
58
+ RgthreeSDXLEmptyLatentImage.NAME: RgthreeSDXLEmptyLatentImage,
59
+ RgthreeSDXLPowerPromptPositive.NAME: RgthreeSDXLPowerPromptPositive,
60
+ RgthreeSDXLPowerPromptSimple.NAME: RgthreeSDXLPowerPromptSimple,
61
+ RgthreeAnySwitch.NAME: RgthreeAnySwitch,
62
+ RgthreeImageComparer.NAME: RgthreeImageComparer,
63
+ RgthreePowerLoraLoader.NAME: RgthreePowerLoraLoader,
64
+ }
65
+
66
+ if get_config_value('unreleased.dynamic_context.enabled') is True:
67
+ NODE_CLASS_MAPPINGS[RgthreeDynamicContext.NAME] = RgthreeDynamicContext
68
+ NODE_CLASS_MAPPINGS[RgthreeDynamicContextSwitch.NAME] = RgthreeDynamicContextSwitch
69
+
70
+ # WEB_DIRECTORY is the comfyui nodes directory that ComfyUI will link and auto-load.
71
+ WEB_DIRECTORY = "./web/comfyui"
72
+
73
+ THIS_DIR = os.path.dirname(os.path.abspath(__file__))
74
+ DIR_WEB = os.path.abspath(f'{THIS_DIR}/{WEB_DIRECTORY}')
75
+ DIR_PY = os.path.abspath(f'{THIS_DIR}/py')
76
+
77
+ # remove old directories
78
+ OLD_DIRS = [
79
+ os.path.abspath(f'{THIS_DIR}/../../web/extensions/rgthree'),
80
+ os.path.abspath(f'{THIS_DIR}/../../web/extensions/rgthree-comfy'),
81
+ ]
82
+ for old_dir in OLD_DIRS:
83
+ if os.path.exists(old_dir):
84
+ shutil.rmtree(old_dir)
85
+
86
+ __all__ = ['NODE_CLASS_MAPPINGS', 'WEB_DIRECTORY']
87
+
88
+ NOT_NODES = ['constants', 'log', 'utils', 'rgthree', 'rgthree_server', 'image_clipbaord', 'config']
89
+
90
+ nodes = []
91
+ for file in glob(os.path.join(DIR_PY, '*.py')) + glob(os.path.join(DIR_WEB, '*.js')):
92
+ name = os.path.splitext(os.path.basename(file))[0]
93
+ if name in NOT_NODES or name in nodes:
94
+ continue
95
+ if name.startswith('_') or name.startswith('base') or 'utils' in name:
96
+ continue
97
+ nodes.append(name)
98
+ if name == 'display_any':
99
+ nodes.append('display_int')
100
+
101
+ print()
102
+ adjs = ['exciting', 'extraordinary', 'epic', 'fantastic', 'magnificent']
103
+ log(f'Loaded {len(nodes)} {random.choice(adjs)} nodes.', color='BRIGHT_GREEN')
104
+
105
+ # Alright, I don't like doing this, but until https://github.com/comfyanonymous/ComfyUI/issues/1502
106
+ # and/or https://github.com/comfyanonymous/ComfyUI/pull/1503 is pulled into ComfyUI, we need a way
107
+ # to optimize the recursion that happens on prompt eval. This is particularly important for
108
+ # rgthree nodes because workflows can contain many context nodes, but the problem would exist for
109
+ # other nodes' (like "pipe" nodes, efficiency nodes). With `Context Big` nodes being
110
+ # introduced, the amount of input recursion that happens in these methods grows exponentially, with a
111
+ # saving of 1000's of percentage points over the unpatched version.
112
+
113
+ # We'll use this to check if we _can_ patch execution. Other work to change the execution may
114
+ # remove these methods, and we want to ensure people's apps do not break.
115
+ could_patch_execution = (hasattr(execution, 'recursive_output_delete_if_changed') and
116
+ hasattr(execution, 'recursive_will_execute') and
117
+ hasattr(execution.PromptExecutor, 'execute'))
118
+
119
+ if get_config_value('features.patch_recursive_execution') is True:
120
+ if not could_patch_execution:
121
+ log("NOTE: Will NOT use rgthree's optimized recursive execution as ComfyUI has changed.",
122
+ color='YELLOW')
123
+ else:
124
+ log("Will use rgthree's optimized recursive execution.", color='BRIGHT_GREEN')
125
+
126
+
127
+ class RgthreePatchRecursiveExecute_Set_patch_recursive_execution_to_false_if_not_working:
128
+ """A fake 'list' that the caller for recursive_will_execute expects but we override such that
129
+ `len(inst)` will return the count number, and `inst[-1]` will return the unique_id. Since that's
130
+ all the caller cares about, we can save several minutes and many MB of ram by simply counting
131
+ numbers instead of concatenating a list of millions (only to count it). However the caller
132
+ expects such a list, so we fake it with this.
133
+
134
+ This mimics the enhancement from https://github.com/rgthree/ComfyUI/commit/50b3fb1 but without
135
+ modifying the execution.py
136
+ """
137
+
138
+ def __init__(self, unique_id):
139
+ self.unique_id = unique_id
140
+ self.count = 0
141
+
142
+ def add(self, value):
143
+ self.count += value
144
+
145
+ def __getitem__(self, key):
146
+ """Returns the `unique_id` with '-1' since that's what the caller expects."""
147
+ if key == -1:
148
+ return self.unique_id
149
+ # This one would future proof the proposed changes, in that case "0" is the count
150
+ if key == 0:
151
+ return self.count
152
+ else:
153
+ return -1
154
+
155
+ def __len__(self):
156
+ """Returns the "count" of the "list" as if we were building up a list instea of just
157
+ incrementing `count`.
158
+ """
159
+ return self.count
160
+
161
+ # The following (hopefully) future proofs if https://github.com/rgthree/ComfyUI/commit/50b3fb1
162
+ # goes in, which changes from using `len` on a list, to sort directly (and, thus "<" and ">").
163
+ def __gt__(self, other):
164
+ return self.count > other
165
+
166
+ def __lt__(self, other):
167
+ return self.count < other
168
+
169
+ def __str__(self):
170
+ return str((
171
+ self.count,
172
+ self.unique_id,
173
+ ))
174
+
175
+
176
+ # Caches which will be cleared on each run
177
+ execution.rgthree_cache_recursive_output_delete_if_changed_output = {}
178
+ execution.rgthree_cache_recursive_will_execute = {}
179
+ execution.rgthree_is_currently_optimized = False
180
+
181
+
182
+ def rgthree_execute(self, *args, **kwargs):
183
+ """ A patch of ComfyUI's default execution for optimization (or un-optimization) via config."""
184
+ if get_config_value('features.patch_recursive_execution') is True:
185
+
186
+ if could_patch_execution:
187
+ log("Using rgthree's optimized recursive execution.", color='GREEN')
188
+ # When we execute, we'll reset our global cache here.
189
+ execution.rgthree_cache_recursive_output_delete_if_changed_output = {}
190
+ execution.rgthree_cache_recursive_will_execute = {}
191
+
192
+ if not execution.rgthree_is_currently_optimized:
193
+ log("First run patching recursive_output_delete_if_changed and recursive_will_execute.",
194
+ color='GREEN',
195
+ msg_color='RESET')
196
+ log(
197
+ "Note: \33[0mIf execution seems broken due to forward ComfyUI changes, you can disable " +
198
+ "the optimization from rgthree settings in ComfyUI.",
199
+ color='YELLOW')
200
+ execution.rgthree_old_recursive_output_delete_if_changed = execution.recursive_output_delete_if_changed
201
+ execution.recursive_output_delete_if_changed = rgthree_recursive_output_delete_if_changed
202
+
203
+ execution.rgthree_old_recursive_will_execute = execution.recursive_will_execute
204
+ execution.recursive_will_execute = rgthree_recursive_will_execute
205
+ execution.rgthree_is_currently_optimized = True
206
+
207
+ elif execution.rgthree_is_currently_optimized:
208
+ log("Removing optimizations to recursive_output_delete_if_changed and recursive_will_execute.",
209
+ color='YELLOW',
210
+ msg_color='RESET')
211
+ log("You can enable optimization in the rgthree settings in ComfyUI.", color='CYAN')
212
+ execution.recursive_output_delete_if_changed = execution.rgthree_old_recursive_output_delete_if_changed
213
+ execution.recursive_will_execute = execution.rgthree_old_recursive_will_execute
214
+ execution.rgthree_is_currently_optimized = False
215
+
216
+ # We always call the original execute; it's just whether we patch or unpatch first.
217
+ return self.rgthree_old_execute(*args, **kwargs)
218
+
219
+
220
+ # We always patch execute, so we can check if we want to do work. Up in rgthree_execute we will
221
+ # either patch or unpatch recursive_will_execute and recursive_output_delete_if_changed at runtime when
222
+ # config changes.
223
+ execution.PromptExecutor.rgthree_old_execute = execution.PromptExecutor.execute
224
+ execution.PromptExecutor.execute = rgthree_execute
225
+
226
+
227
+ def rgthree_recursive_will_execute(prompt, outputs, current_item, *args, **kwargs):
228
+ """Patches recursive_will_execute function to cache the result of each output."""
229
+ unique_id = current_item
230
+ inputs = prompt[unique_id]['inputs']
231
+ will_execute = RgthreePatchRecursiveExecute_Set_patch_recursive_execution_to_false_if_not_working(
232
+ unique_id)
233
+ if unique_id in outputs:
234
+ return will_execute
235
+
236
+ will_execute.add(1)
237
+ for x in inputs:
238
+ input_data = inputs[x]
239
+ if isinstance(input_data, list):
240
+ input_unique_id = input_data[0]
241
+ output_index = input_data[1]
242
+ node_output_cache_key = f'{input_unique_id}.{output_index}'
243
+ will_execute_value = None
244
+ # If this node's output has already been recursively evaluated, then we can reuse.
245
+ if node_output_cache_key in execution.rgthree_cache_recursive_will_execute:
246
+ will_execute_value = execution.rgthree_cache_recursive_will_execute[node_output_cache_key]
247
+ elif input_unique_id not in outputs:
248
+ will_execute_value = execution.recursive_will_execute(prompt, outputs, input_unique_id,
249
+ *args, **kwargs)
250
+ execution.rgthree_cache_recursive_will_execute[node_output_cache_key] = will_execute_value
251
+ if will_execute_value is not None:
252
+ will_execute.add(len(will_execute_value))
253
+ return will_execute
254
+
255
+
256
+ def rgthree_recursive_output_delete_if_changed(prompt, old_prompt, outputs, current_item, *args,
257
+ **kwargs):
258
+ """Patches recursive_output_delete_if_changed function to cache the result of each output."""
259
+ unique_id = current_item
260
+ inputs = prompt[unique_id]['inputs']
261
+ class_type = prompt[unique_id]['class_type']
262
+ class_def = execution.nodes.NODE_CLASS_MAPPINGS[class_type]
263
+
264
+ is_changed_old = ''
265
+ is_changed = ''
266
+ to_delete = False
267
+ if hasattr(class_def, 'IS_CHANGED'):
268
+ if unique_id in old_prompt and 'is_changed' in old_prompt[unique_id]:
269
+ is_changed_old = old_prompt[unique_id]['is_changed']
270
+ if 'is_changed' not in prompt[unique_id]:
271
+ input_data_all = execution.get_input_data(inputs, class_def, unique_id, outputs)
272
+ if input_data_all is not None:
273
+ try:
274
+ #is_changed = class_def.IS_CHANGED(**input_data_all)
275
+ is_changed = execution.map_node_over_list(class_def, input_data_all, "IS_CHANGED")
276
+ prompt[unique_id]['is_changed'] = is_changed
277
+ except:
278
+ to_delete = True
279
+ else:
280
+ is_changed = prompt[unique_id]['is_changed']
281
+
282
+ if unique_id not in outputs:
283
+ return True
284
+
285
+ if not to_delete:
286
+ if is_changed != is_changed_old:
287
+ to_delete = True
288
+ elif unique_id not in old_prompt:
289
+ to_delete = True
290
+ elif inputs == old_prompt[unique_id]['inputs']:
291
+ for x in inputs:
292
+ input_data = inputs[x]
293
+
294
+ if isinstance(input_data, list):
295
+ input_unique_id = input_data[0]
296
+ output_index = input_data[1]
297
+ node_output_cache_key = f'{input_unique_id}.{output_index}'
298
+ # If this node's output has already been recursively evaluated, then we can stop.
299
+ if node_output_cache_key in execution.rgthree_cache_recursive_output_delete_if_changed_output:
300
+ to_delete = execution.rgthree_cache_recursive_output_delete_if_changed_output[
301
+ node_output_cache_key]
302
+ elif input_unique_id in outputs:
303
+ to_delete = execution.recursive_output_delete_if_changed(prompt, old_prompt, outputs,
304
+ input_unique_id, *args,
305
+ **kwargs)
306
+ execution.rgthree_cache_recursive_output_delete_if_changed_output[
307
+ node_output_cache_key] = to_delete
308
+ else:
309
+ to_delete = True
310
+ if to_delete:
311
+ break
312
+ else:
313
+ to_delete = True
314
+
315
+ if to_delete:
316
+ d = outputs.pop(unique_id)
317
+ del d
318
+ return to_delete
319
+
320
+
321
+ print()
rgthree-comfy/__update_comfy__.py ADDED
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # A nicer output for git pulling custom nodes (and ComfyUI).
4
+ # Quick shell version: ls | xargs -I % sh -c 'echo; echo %; git -C % pull'
5
+
6
+ import os
7
+ from subprocess import Popen, PIPE, STDOUT
8
+
9
+
10
+ def pull_path(path):
11
+ p = Popen(["git", "-C", path, "pull"], stdout=PIPE, stderr=STDOUT)
12
+ output, error = p.communicate()
13
+ return output.decode()
14
+
15
+ THIS_DIR=os.path.dirname(os.path.abspath(__file__))
16
+
17
+ def show_output(output):
18
+ if output.startswith('Already up to date'):
19
+ print(f' \33[32m🗸 {output}\33[0m', end ='')
20
+ elif output.startswith('error:'):
21
+ print(f' \33[31m🞫 Error.\33[0m \n {output}')
22
+ else:
23
+ print(f' \33[33m🡅 Needs update.\33[0m \n {output}', end='')
24
+
25
+
26
+ os.chdir(THIS_DIR)
27
+ os.chdir("../")
28
+
29
+ # Get the list of custom nodes, so we can format the output a little more nicely.
30
+ custom_extensions = []
31
+ custom_extensions_name_max = 0
32
+ for directory in os.listdir(os.getcwd()):
33
+ if os.path.isdir(directory) and directory != "__pycache__": #and directory != "rgthree-comfy" :
34
+ custom_extensions.append({
35
+ 'directory': directory
36
+ })
37
+ if len(directory) > custom_extensions_name_max:
38
+ custom_extensions_name_max = len(directory)
39
+
40
+ if len(custom_extensions) == 0:
41
+ custom_extensions_name_max = 15
42
+ else:
43
+ custom_extensions_name_max += 6
44
+
45
+ # Update ComfyUI itself.
46
+ label = "{0:.<{max}}".format('Updating ComfyUI ', max=custom_extensions_name_max)
47
+ print(label, end = '')
48
+ show_output(pull_path('../'))
49
+
50
+ # If we have custom nodes, update them as well.
51
+ if len(custom_extensions) > 0:
52
+ print(f'\nUpdating custom_nodes ({len(custom_extensions)}):')
53
+ for custom_extension in custom_extensions:
54
+ directory = custom_extension['directory']
55
+ label = "{0:.<{max}}".format(f'🗀 {directory} ', max=custom_extensions_name_max)
56
+ print(label, end = '')
57
+ show_output(pull_path(directory))
rgthree-comfy/docs/rgthree_advanced.png ADDED

Git LFS Details

  • SHA256: 77d88a50847fa76d95a1470bf0315b035a06e3c87a8d4e8132d49d8d5b8a31ce
  • Pointer size: 131 Bytes
  • Size of remote file: 456 kB
rgthree-comfy/docs/rgthree_advanced_metadata.png ADDED

Git LFS Details

  • SHA256: c33be251bd628225b9e29249b766b45539a62a9f94f2e0085b10c469f5ef0956
  • Pointer size: 131 Bytes
  • Size of remote file: 491 kB
rgthree-comfy/docs/rgthree_context.png ADDED

Git LFS Details

  • SHA256: 4d7c4401b0f4b6958d75b9eca531b0c16d8eeb121f3ec8092ccf1336966c43cd
  • Pointer size: 131 Bytes
  • Size of remote file: 544 kB
rgthree-comfy/docs/rgthree_context_metadata.png ADDED

Git LFS Details

  • SHA256: 0f89ef9decfee6b65831780a85235f6bfedd99ff1f16eaaf0567a17d94997ba7
  • Pointer size: 132 Bytes
  • Size of remote file: 1.69 MB