Abdualkader committed
Commit 72fea7b · verified · Parent(s): 5b718cb

Upload folder using huggingface_hub

README.md CHANGED
@@ -1,3 +1,53 @@
- ---
- license: mit
- ---
+ ---
+ license: apache-2.0
+ ---
+ # ImageDream-diffusers Model Card
+ This is a port of https://huggingface.co/Peng-Wang/ImageDream into diffusers.
+
+ The ported weights come from https://huggingface.co/ashawkey/imagedream-ipmv-diffuser.
+ In ashawkey's work, the UNet was not ported to diffusers.
+
+ This repository is fully ported to diffusers, including the UNet,
+ and the IP-Adapter-Plus weights are separated from the UNet.
+
+ ## Diffusers
+ ```python
+ import torch
+ from diffusers import DiffusionPipeline
+ from diffusers.utils import make_image_grid
+ from PIL import Image
+
+ pipe = DiffusionPipeline.from_pretrained(
+     "kiigii/imagedream-ipmv-diffusers",
+     torch_dtype=torch.float16,
+     trust_remote_code=True,
+ )
+ pipe.load_ip_adapter()
+ pipe.to("cuda")
+
+ prompt = ""  # the text prompt is optional; an empty string works
+ image = Image.open(...)
+
+ mv_images = pipe(
+     prompt=prompt,
+     ip_adapter_image=image,
+     guidance_scale=5,
+     num_inference_steps=30,
+     elevation=0,
+     num_images_per_prompt=1,
+ ).images
+ mv_grid = make_image_grid(mv_images[:4], 2, 2)
+ mv_grid.save("mv_image.png")
+ ```
+
+ ## Citation
+ ```bibtex
+ @article{wang2023imagedream,
+   title={ImageDream: Image-Prompt Multi-view Diffusion for 3D Generation},
+   author={Wang, Peng and Shi, Yichun},
+   journal={arXiv preprint arXiv:2312.02201},
+   year={2023}
+ }
+ ```
+ ## Misuse, Malicious Use, and Out-of-Scope Use
+ The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive, or content that propagates historical or current stereotypes.
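
A note inferred from `pipeline_mvdiffusion.py` below (not stated in the model card): when `ip_adapter_image` is supplied the pipeline appears to render an extra fifth, image-conditioned view, which is why the snippet above indexes `mv_images[:4]`. A hypothetical continuation of that snippet for keeping every returned view:

```python
# Hypothetical continuation of the README snippet above.
# With an image prompt, `mv_images` holds 5 images: 4 orbit views
# plus 1 extra image-conditioned view appended by the pipeline.
for i, view in enumerate(mv_images):
    view.save(f"view_{i}.png")
```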
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "_valid_processor_keys": [
+     "images",
+     "do_resize",
+     "size",
+     "resample",
+     "do_center_crop",
+     "crop_size",
+     "do_rescale",
+     "rescale_factor",
+     "do_normalize",
+     "image_mean",
+     "image_std",
+     "do_convert_rgb",
+     "return_tensors",
+     "data_format",
+     "input_data_format"
+   ],
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
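
For reference, a minimal sketch of exercising this preprocessor on its own (the repository id is the one from the README; the input file name is illustrative):

```python
from PIL import Image
from transformers import CLIPImageProcessor

# Applies the config above: resize shortest edge to 224, center-crop to
# 224x224, rescale by 1/255, then normalize with the CLIP mean/std.
processor = CLIPImageProcessor.from_pretrained(
    "kiigii/imagedream-ipmv-diffusers", subfolder="feature_extractor"
)
pixel_values = processor(images=Image.open("input.png"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```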
ip_adapter/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+   "architectures": [
+     "CLIPVisionModel"
+   ],
+   "attention_dropout": 0.0,
+   "dropout": 0.0,
+   "hidden_act": "gelu",
+   "hidden_size": 1280,
+   "image_size": 224,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 5120,
+   "layer_norm_eps": 1e-05,
+   "model_type": "clip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 32,
+   "patch_size": 14,
+   "projection_dim": 1024,
+   "torch_dtype": "float16",
+   "transformers_version": "4.41.2"
+ }
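
A minimal sketch of loading this image encoder standalone (assuming the repository id from the README and that `subfolder` accepts the nested path):

```python
import torch
from transformers import CLIPVisionModel

# Loads the ViT-H/14 vision tower described above (32 layers, width 1280).
encoder = CLIPVisionModel.from_pretrained(
    "kiigii/imagedream-ipmv-diffusers",
    subfolder="ip_adapter/image_encoder",
    torch_dtype=torch.float16,
)
print(encoder.config.hidden_size)  # 1280
```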
ip_adapter/image_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a56cfd4ffcf40be097c430324ec184cc37187f6dafef128ef9225438a3c03c4
+ size 1261595704
ip_adapter/ip-adapter-plus_imagedream.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ccadbfaf399f3a0e12eeaef7c1dc3a0002de801bb4d7b134bf85ca3204bcc4b
+ size 148229970
model_index.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "_class_name": [
+     "pipeline_mvdiffusion",
+     "MVDiffusionPipeline"
+   ],
+   "_diffusers_version": "0.29.1",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "image_encoder": [
+     null,
+     null
+   ],
+   "requires_safety_checker": false,
+   "safety_checker": [
+     null,
+     null
+   ],
+   "scheduler": [
+     "diffusers",
+     "DDIMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "diffusers",
+     "UNet2DConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
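
This `model_index.json` is what the README's `DiffusionPipeline.from_pretrained(..., trust_remote_code=True)` call appears to rely on: the list-valued `_class_name` points at the `MVDiffusionPipeline` class inside `pipeline_mvdiffusion.py`, and each remaining entry names the library and class loaded from the matching subfolder. A quick sketch of inspecting the result:

```python
from diffusers import DiffusionPipeline

# trust_remote_code=True lets pipeline_mvdiffusion.py (shipped in this
# repository) provide the custom MVDiffusionPipeline class.
pipe = DiffusionPipeline.from_pretrained(
    "kiigii/imagedream-ipmv-diffusers", trust_remote_code=True
)
print(type(pipe).__name__)            # MVDiffusionPipeline
print(type(pipe.scheduler).__name__)  # DDIMScheduler
```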
pipeline_mvdiffusion.py ADDED
@@ -0,0 +1,578 @@
+ from typing import Any, Callable, Dict, List, Optional, Union
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ try:
+     from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
+ except ImportError:
+     # Stubs for older diffusers versions without the callbacks module.
+     class MultiPipelineCallbacks:
+         ...
+
+     class PipelineCallback:
+         ...
+
+
+ from diffusers.image_processor import PipelineImageInput
+ from diffusers.models import AutoencoderKL, UNet2DConditionModel
+ from diffusers.models.attention import Attention
+ from diffusers.models.attention_processor import AttnProcessor2_0
+ from diffusers.pipelines.stable_diffusion.pipeline_output import (
+     StableDiffusionPipelineOutput,
+ )
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import (
+     StableDiffusionPipeline,
+     rescale_noise_cfg,
+     retrieve_timesteps,
+ )
+ from diffusers.pipelines.stable_diffusion.safety_checker import (
+     StableDiffusionSafetyChecker,
+ )
+ from diffusers.schedulers import KarrasDiffusionSchedulers
+ from diffusers.utils import deprecate
+ from transformers import (
+     CLIPImageProcessor,
+     CLIPTextModel,
+     CLIPTokenizer,
+     CLIPVisionModel,
+ )
+
+
+ class MVDiffusionPipeline(StableDiffusionPipeline):
+     def __init__(
+         self,
+         vae: AutoencoderKL,
+         text_encoder: CLIPTextModel,
+         tokenizer: CLIPTokenizer,
+         unet: UNet2DConditionModel,
+         scheduler: KarrasDiffusionSchedulers,
+         safety_checker: StableDiffusionSafetyChecker,
+         feature_extractor: Optional[CLIPImageProcessor] = None,
+         image_encoder: Optional[CLIPVisionModel] = None,
+         requires_safety_checker: bool = False,
+     ) -> None:
+         super().__init__(
+             vae=vae,
+             text_encoder=text_encoder,
+             tokenizer=tokenizer,
+             unet=add_mv_attn_processor(unet),
+             scheduler=scheduler,
+             safety_checker=safety_checker,
+             feature_extractor=feature_extractor,
+             image_encoder=image_encoder,
+             requires_safety_checker=requires_safety_checker,
+         )
+         self.num_views = 4
+
+     def load_ip_adapter(
+         self,
+         pretrained_model_name_or_path_or_dict: Union[
+             str, List[str], Dict[str, torch.Tensor]
+         ] = "kiigii/imagedream-ipmv-diffusers",
+         subfolder: Union[str, List[str]] = "ip_adapter",
+         weight_name: Union[str, List[str]] = "ip-adapter-plus_imagedream.bin",
+         image_encoder_folder: Optional[str] = "image_encoder",
+         **kwargs,
+     ) -> None:
+         super().load_ip_adapter(
+             pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
+             subfolder=subfolder,
+             weight_name=weight_name,
+             image_encoder_folder=image_encoder_folder,
+             **kwargs,
+         )
+         print("IP-Adapter Loaded.")
+
+         if weight_name == "ip-adapter-plus_imagedream.bin":
+             setattr(self.image_encoder, "visual_projection", nn.Identity())
+         # Loading an IP-Adapter replaces the attention processors, so the
+         # multi-view processors must be re-installed, with one extra view
+         # for the image prompt.
+         add_mv_attn_processor(self.unet)
+         set_num_views(self.unet, self.num_views + 1)
+
+     def unload_ip_adapter(self) -> None:
+         super().unload_ip_adapter()
+         set_num_views(self.unet, self.num_views)
+
+     def encode_image_to_latents(
+         self,
+         image: PipelineImageInput,
+         height: int,
+         width: int,
+         device: torch.device,
+         num_images_per_prompt: int = 1,
+     ):
+         dtype = next(self.vae.parameters()).dtype
+
+         if isinstance(image, torch.Tensor):
+             image = F.interpolate(
+                 image,
+                 (height, width),
+                 mode="bilinear",
+                 align_corners=False,
+                 antialias=True,
+             )
+         else:
+             image = self.image_processor.preprocess(image, height, width)
+
+         # image should be in range [-1, 1]
+         image = image.to(device=device, dtype=dtype)
+
+         def vae_encode(image):
+             posterior = self.vae.encode(image).latent_dist
+             latents = posterior.sample() * self.vae.config.scaling_factor
+             latents = latents.repeat_interleave(num_images_per_prompt, dim=0)
+             return latents
+
+         latents = vae_encode(image)
+         uncond_latents = vae_encode(torch.zeros_like(image))
+         return latents, uncond_latents
+
+     @torch.no_grad()
+     def __call__(
+         self,
+         prompt: Union[str, List[str]] = None,
+         height: Optional[int] = None,
+         width: Optional[int] = None,
+         num_inference_steps: int = 50,
+         elevation: float = 0.0,
+         timesteps: List[int] = None,
+         sigmas: List[float] = None,
+         guidance_scale: float = 5.0,
+         negative_prompt: Optional[Union[str, List[str]]] = None,
+         num_images_per_prompt: Optional[int] = 1,
+         eta: float = 0.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         latents: Optional[torch.Tensor] = None,
+         prompt_embeds: Optional[torch.Tensor] = None,
+         negative_prompt_embeds: Optional[torch.Tensor] = None,
+         ip_adapter_image: Optional[PipelineImageInput] = None,
+         # StableDiffusionPipeline supports `ip_adapter_image_embeds`, but this
+         # pipeline does not; passing it raises a ValueError below.
+         ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
+         output_type: Optional[str] = "pil",
+         return_dict: bool = True,
+         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+         guidance_rescale: float = 0.0,
+         clip_skip: Optional[int] = None,
+         callback_on_step_end: Optional[
+             Union[
+                 Callable[[int, int, Dict], None],
+                 PipelineCallback,
+                 MultiPipelineCallbacks,
+             ]
+         ] = None,
+         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+         **kwargs,
+     ):
+         if ip_adapter_image_embeds is not None:
+             raise ValueError(
+                 "`ip_adapter_image_embeds` is not supported by ImageDream; use `ip_adapter_image` instead."
+             )
+
+         callback = kwargs.pop("callback", None)
+         callback_steps = kwargs.pop("callback_steps", None)
+
+         if callback is not None:
+             deprecate(
+                 "callback",
+                 "1.0.0",
+                 "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+             )
+         if callback_steps is not None:
+             deprecate(
+                 "callback_steps",
+                 "1.0.0",
+                 "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+             )
+
+         if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+             callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
+         # ImageDream number of views
+         if cross_attention_kwargs is None:
+             num_views = self.num_views
+         else:
+             num_views = cross_attention_kwargs.pop("num_views", self.num_views)
+
+         # 0. Default height and width to unet
+         height = height or self.unet.config.sample_size * self.vae_scale_factor
+         width = width or self.unet.config.sample_size * self.vae_scale_factor
+
+         # 1. Check inputs. Raise error if not correct
+         if prompt is None:
+             prompt = ""
+         self.check_inputs(
+             prompt,
+             height,
+             width,
+             callback_steps,
+             negative_prompt,
+             prompt_embeds,
+             negative_prompt_embeds,
+             ip_adapter_image,
+             None,  # ip_adapter_image_embeds
+             callback_on_step_end_tensor_inputs,
+         )
+
+         self._guidance_scale = guidance_scale
+         self._guidance_rescale = guidance_rescale
+         self._clip_skip = clip_skip
+         self._cross_attention_kwargs = cross_attention_kwargs
+         self._interrupt = False
+
+         # 2. Define call parameters
+         if prompt is not None and isinstance(prompt, str):
+             batch_size = 1
+         elif prompt is not None and isinstance(prompt, list):
+             batch_size = len(prompt)
+         else:
+             batch_size = prompt_embeds.shape[0]
+
+         device = self._execution_device
+
+         # 3. Encode input prompt
+         # to deal with lora scaling and other possible forward hooks
+         lora_scale = (
+             self.cross_attention_kwargs.get("scale", None)
+             if self.cross_attention_kwargs is not None
+             else None
+         )
+
+         prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+             prompt,
+             device,
+             num_images_per_prompt,
+             self.do_classifier_free_guidance,
+             negative_prompt,
+             prompt_embeds=prompt_embeds,
+             negative_prompt_embeds=negative_prompt_embeds,
+             lora_scale=lora_scale,
+             clip_skip=self.clip_skip,
+         )
+
+         # camera parameter for ImageDream
+         camera = get_camera(
+             num_views, elevation=elevation, extra_view=ip_adapter_image is not None
+         ).to(dtype=prompt_embeds.dtype, device=device)
+         camera = camera.repeat(batch_size * num_images_per_prompt, 1)
+
+         if ip_adapter_image is not None:
+             image_embeds = self.prepare_ip_adapter_image_embeds(
+                 ip_adapter_image,
+                 None,  # ip_adapter_image_embeds
+                 device,
+                 batch_size * num_images_per_prompt,
+                 self.do_classifier_free_guidance,
+             )
+             # ImageDream
+             image_latents, negative_image_latents = self.encode_image_to_latents(
+                 ip_adapter_image,
+                 height,
+                 width,
+                 device,
+                 batch_size * num_images_per_prompt,
+             )
+             num_views += 1
+
+         # For classifier free guidance, we need to do two forward passes.
+         # Here we concatenate the unconditional and text embeddings into a single batch
+         # to avoid doing two forward passes
+         if self.do_classifier_free_guidance:
+             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+             camera = torch.cat([camera] * 2)
+             if ip_adapter_image is not None:
+                 image_latents = torch.cat([negative_image_latents, image_latents])
+
+         # Multi-view inputs for ImageDream.
+         prompt_embeds = prompt_embeds.repeat_interleave(num_views, dim=0)
+         if ip_adapter_image is not None:
+             image_embeds = [i.repeat_interleave(num_views, dim=0) for i in image_embeds]
+
+         # 4. Prepare timesteps
+         timesteps, num_inference_steps = retrieve_timesteps(
+             self.scheduler, num_inference_steps, device, timesteps, sigmas
+         )
+
+         # 5. Prepare latent variables
+         num_channels_latents = self.unet.config.in_channels
+         latents = self.prepare_latents(
+             batch_size * num_images_per_prompt * num_views,
+             num_channels_latents,
+             height,
+             width,
+             prompt_embeds.dtype,
+             device,
+             generator,
+             latents,
+         )
+
+         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+         # 6.1 Add image embeds for IP-Adapter
+         if ip_adapter_image is not None:
+             added_cond_kwargs = {"image_embeds": image_embeds}
+         else:
+             added_cond_kwargs = None
+
+         # 6.2 Optionally get Guidance Scale Embedding
+         timestep_cond = None
+         if self.unet.config.time_cond_proj_dim is not None:
+             guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
+                 batch_size * num_images_per_prompt
+             )
+             timestep_cond = self.get_guidance_scale_embedding(
+                 guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
+             ).to(device=device, dtype=latents.dtype)
+
+         set_num_views(self.unet, num_views)
+
+         # fmt: off
+         # 7. Denoising loop
+         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+         self._num_timesteps = len(timesteps)
+         with self.progress_bar(total=num_inference_steps) as progress_bar:
+             for i, t in enumerate(timesteps):
+                 if self.interrupt:
+                     continue
+
+                 # expand the latents if we are doing classifier free guidance
+                 latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                 if ip_adapter_image is not None:
+                     latent_model_input[num_views - 1 :: num_views, :, :, :] = image_latents
+                 # predict the noise residual
+                 noise_pred = self.unet(
+                     latent_model_input,
+                     t,
+                     class_labels=camera,
+                     encoder_hidden_states=prompt_embeds,
+                     timestep_cond=timestep_cond,
+                     cross_attention_kwargs=self.cross_attention_kwargs,
+                     added_cond_kwargs=added_cond_kwargs,
+                     return_dict=False,
+                 )[0]
+
+                 # perform guidance
+                 if self.do_classifier_free_guidance:
+                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                     noise_pred = torch.lerp(noise_pred_uncond, noise_pred_text, self.guidance_scale)
+
+                 if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
+                     # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
+                     noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
+
+                 # compute the previous noisy sample x_t -> x_t-1
+                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+                 if callback_on_step_end is not None:
+                     callback_kwargs = {}
+                     for k in callback_on_step_end_tensor_inputs:
+                         callback_kwargs[k] = locals()[k]
+                     callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                     latents = callback_outputs.pop("latents", latents)
+                     prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                     negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                 # call the callback, if provided
+                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                     progress_bar.update()
+                     if callback is not None and i % callback_steps == 0:
+                         step_idx = i // getattr(self.scheduler, "order", 1)
+                         callback(step_idx, t, latents)
+         # fmt: on
+         if not output_type == "latent":
+             image = self.vae.decode(
+                 latents / self.vae.config.scaling_factor,
+                 return_dict=False,
+                 generator=generator,
+             )[0]
+             image, has_nsfw_concept = self.run_safety_checker(
+                 image, device, prompt_embeds.dtype
+             )
+         else:
+             image = latents
+             has_nsfw_concept = None
+
+         if has_nsfw_concept is None:
+             do_denormalize = [True] * image.shape[0]
+         else:
+             do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+         image = self.image_processor.postprocess(
+             image, output_type=output_type, do_denormalize=do_denormalize
+         )
+
+         # Offload all models
+         self.maybe_free_model_hooks()
+
+         if not return_dict:
+             return (image, has_nsfw_concept)
+
+         return StableDiffusionPipelineOutput(
+             images=image, nsfw_content_detected=has_nsfw_concept
+         )
+
+
+ # fmt: off
+ # Copied from ImageDream
+ # https://github.com/bytedance/ImageDream/blob/main/extern/ImageDream/imagedream/camera_utils.py
+
+
+ def create_camera_to_world_matrix(elevation, azimuth):
+     elevation = np.radians(elevation)
+     azimuth = np.radians(azimuth)
+     # Convert elevation and azimuth angles to Cartesian coordinates on a unit sphere
+     x = np.cos(elevation) * np.sin(azimuth)
+     y = np.sin(elevation)
+     z = np.cos(elevation) * np.cos(azimuth)
+
+     # Calculate camera position, target, and up vectors
+     camera_pos = np.array([x, y, z])
+     target = np.array([0, 0, 0])
+     up = np.array([0, 1, 0])
+
+     # Construct view matrix
+     forward = target - camera_pos
+     forward /= np.linalg.norm(forward)
+     right = np.cross(forward, up)
+     right /= np.linalg.norm(right)
+     new_up = np.cross(right, forward)
+     new_up /= np.linalg.norm(new_up)
+     cam2world = np.eye(4)
+     cam2world[:3, :3] = np.array([right, new_up, -forward]).T
+     cam2world[:3, 3] = camera_pos
+     return cam2world
+
+
+ def convert_opengl_to_blender(camera_matrix):
+     if isinstance(camera_matrix, np.ndarray):
+         # Construct transformation matrix to convert from OpenGL space to Blender space
+         flip_yz = np.array([[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]])
+         camera_matrix_blender = np.dot(flip_yz, camera_matrix)
+     else:
+         # Construct transformation matrix to convert from OpenGL space to Blender space
+         flip_yz = torch.tensor(
+             [[1, 0, 0, 0], [0, 0, -1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]
+         )
+         if camera_matrix.ndim == 3:
+             flip_yz = flip_yz.unsqueeze(0)
+         camera_matrix_blender = torch.matmul(flip_yz.to(camera_matrix), camera_matrix)
+     return camera_matrix_blender
+
+
+ def normalize_camera(camera_matrix):
+     """normalize the camera location onto a unit-sphere"""
+     if isinstance(camera_matrix, np.ndarray):
+         camera_matrix = camera_matrix.reshape(-1, 4, 4)
+         translation = camera_matrix[:, :3, 3]
+         translation = translation / (
+             np.linalg.norm(translation, axis=1, keepdims=True) + 1e-8
+         )
+         camera_matrix[:, :3, 3] = translation
+     else:
+         camera_matrix = camera_matrix.reshape(-1, 4, 4)
+         translation = camera_matrix[:, :3, 3]
+         translation = translation / (
+             torch.norm(translation, dim=1, keepdim=True) + 1e-8
+         )
+         camera_matrix[:, :3, 3] = translation
+     return camera_matrix.reshape(-1, 16)
+
+
+ def get_camera(
+     num_frames,
+     elevation=15,
+     azimuth_start=0,
+     azimuth_span=360,
+     blender_coord=True,
+     extra_view=False,
+ ):
+     angle_gap = azimuth_span / num_frames
+     cameras = []
+     for azimuth in np.arange(azimuth_start, azimuth_span + azimuth_start, angle_gap):
+         camera_matrix = create_camera_to_world_matrix(elevation, azimuth)
+         if blender_coord:
+             camera_matrix = convert_opengl_to_blender(camera_matrix)
+         cameras.append(camera_matrix.flatten())
+
+     if extra_view:
+         dim = len(cameras[0])
+         cameras.append(np.zeros(dim))
+     return torch.tensor(np.stack(cameras, 0)).float()
+ # fmt: on
+
+
+ def add_mv_attn_processor(unet: UNet2DConditionModel, num_views: int = 4) -> UNet2DConditionModel:
+     # Install the multi-view processor on self-attention ("attn1") layers only.
+     attn_procs = {}
+     for key, attn_processor in unet.attn_processors.items():
+         if "attn1" in key:
+             attn_procs[key] = MVAttnProcessor2_0(num_views)
+         else:
+             attn_procs[key] = attn_processor
+     unet.set_attn_processor(attn_procs)
+     return unet
+
+
+ def set_num_views(unet: UNet2DConditionModel, num_views: int) -> UNet2DConditionModel:
+     for key, attn_processor in unet.attn_processors.items():
+         if isinstance(attn_processor, MVAttnProcessor2_0):
+             attn_processor.num_views = num_views
+     return unet
+
+
+ class MVAttnProcessor2_0(AttnProcessor2_0):
+     def __init__(self, num_views: int = 4):
+         super().__init__()
+         self.num_views = num_views
+
+     def __call__(
+         self,
+         attn: Attention,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         temb: Optional[torch.Tensor] = None,
+         *args,
+         **kwargs,
+     ):
+         if self.num_views == 1:
+             return super().__call__(
+                 attn=attn,
+                 hidden_states=hidden_states,
+                 encoder_hidden_states=encoder_hidden_states,
+                 attention_mask=attention_mask,
+                 temb=temb,
+                 *args,
+                 **kwargs,
+             )
+
+         input_ndim = hidden_states.ndim
+         B = hidden_states.size(0)
+         if B % self.num_views:
+             raise ValueError(
+                 f"`batch_size` (got {B}) must be a multiple of `num_views` (got {self.num_views})."
+             )
+         # Fold the view axis into the sequence axis so self-attention spans
+         # all views of a sample at once.
+         real_B = B // self.num_views
+         if input_ndim == 4:
+             H, W = hidden_states.shape[2:]
+             hidden_states = hidden_states.reshape(real_B, -1, H, W).transpose(1, 2)
+         else:
+             hidden_states = hidden_states.reshape(real_B, -1, hidden_states.size(-1))
+         hidden_states = super().__call__(
+             attn=attn,
+             hidden_states=hidden_states,
+             encoder_hidden_states=encoder_hidden_states,
+             attention_mask=attention_mask,
+             temb=temb,
+             *args,
+             **kwargs,
+         )
+         # Restore the original per-view batch layout.
+         if input_ndim == 4:
+             hidden_states = hidden_states.transpose(-1, -2).reshape(B, -1, H, W)
+         else:
+             hidden_states = hidden_states.reshape(B, -1, hidden_states.size(-1))
+         return hidden_states
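
A small sketch of the camera conditioning defined above, assuming `pipeline_mvdiffusion.py` is importable locally: `get_camera` returns one flattened 4x4 camera-to-world matrix per view (16 values, matching the UNet's `projection_class_embeddings_input_dim`), and appends an all-zero row when `extra_view=True` for the image-prompt view.

```python
from pipeline_mvdiffusion import get_camera  # local file from this repository

# Four evenly spaced azimuths at elevation 0, plus one zero "extra" view.
cams = get_camera(4, elevation=0, extra_view=True)
print(cams.shape)  # torch.Size([5, 16])
```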
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "_class_name": "DDIMScheduler",
+   "_diffusers_version": "0.29.0",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "clip_sample_range": 1.0,
+   "dynamic_thresholding_ratio": 0.995,
+   "num_train_timesteps": 1000,
+   "prediction_type": "epsilon",
+   "rescale_betas_zero_snr": false,
+   "sample_max_value": 1.0,
+   "set_alpha_to_one": false,
+   "steps_offset": 1,
+   "thresholding": false,
+   "timestep_spacing": "leading",
+   "trained_betas": null
+ }
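
A minimal sketch (repository id as in the README) of instantiating this scheduler by itself:

```python
from diffusers import DDIMScheduler

# Reconstructs the DDIM settings above (epsilon prediction,
# scaled_linear betas, leading timestep spacing).
scheduler = DDIMScheduler.from_pretrained(
    "kiigii/imagedream-ipmv-diffusers", subfolder="scheduler"
)
scheduler.set_timesteps(30)
print(len(scheduler.timesteps))  # 30
```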
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "stabilityai/stable-diffusion-2-1",
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dropout": 0.0,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_size": 1024,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 77,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 23,
+   "pad_token_id": 1,
+   "projection_dim": 512,
+   "torch_dtype": "float16",
+   "transformers_version": "4.41.2",
+   "vocab_size": 49408
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc1827c465450322616f06dea41596eac7d493f4e95904dcb51f0fc745c4e13f
+ size 680820392
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "!",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "!",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49406": {
+       "content": "<|startoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49407": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|startoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": true,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "model_max_length": 77,
+   "pad_token": "!",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,77 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.29.0",
+   "act_fn": "silu",
+   "addition_embed_type": null,
+   "addition_embed_type_num_heads": 64,
+   "addition_time_embed_dim": null,
+   "attention_head_dim": [
+     5,
+     10,
+     20,
+     20
+   ],
+   "attention_type": "default",
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "class_embed_type": "projection",
+   "class_embeddings_concat": false,
+   "conv_in_kernel": 3,
+   "conv_out_kernel": 3,
+   "cross_attention_dim": 1024,
+   "cross_attention_norm": null,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "dropout": 0.0,
+   "dual_cross_attention": false,
+   "encoder_hid_dim": null,
+   "encoder_hid_dim_type": null,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": [
+     2,
+     2,
+     2,
+     2
+   ],
+   "mid_block_only_cross_attention": null,
+   "mid_block_scale_factor": 1,
+   "mid_block_type": "UNetMidBlock2DCrossAttn",
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "num_attention_heads": null,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
+   "out_channels": 4,
+   "projection_class_embeddings_input_dim": 16,
+   "resnet_out_scale_factor": 1.0,
+   "resnet_skip_time_act": false,
+   "resnet_time_scale_shift": "default",
+   "reverse_transformer_layers_per_block": null,
+   "sample_size": 32,
+   "time_cond_proj_dim": null,
+   "time_embedding_act_fn": null,
+   "time_embedding_dim": null,
+   "time_embedding_type": "positional",
+   "timestep_post_act": null,
+   "transformer_layers_per_block": 1,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ],
+   "upcast_attention": false,
+   "use_linear_projection": true
+ }
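
Two entries here carry the multi-view conditioning: `class_embed_type: "projection"` together with `projection_class_embeddings_input_dim: 16` means the flattened 4x4 camera matrix is fed through `class_labels`, which is exactly how the pipeline's denoising loop above passes `camera` to the UNet. A sketch of checking this (repository id as in the README):

```python
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "kiigii/imagedream-ipmv-diffusers", subfolder="unet", torch_dtype=torch.float16
)
# 16 = one flattened 4x4 camera-to-world matrix per sample.
print(unet.config.projection_class_embeddings_input_dim)  # 16
```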
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f798a8be7b7473f6ffff22168e2d99b80ec4afdf339cd273d1b0036b125efae
+ size 1735228080
vae/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.29.0",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "force_upcast": true,
+   "in_channels": 3,
+   "latent_channels": 4,
+   "latents_mean": null,
+   "latents_std": null,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 256,
+   "scaling_factor": 0.18215,
+   "shift_factor": null,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ],
+   "use_post_quant_conv": true,
+   "use_quant_conv": true
+ }
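
A worked detail: with four `block_out_channels` the VAE downsamples by 2^(4-1) = 8, so the pipeline's default resolution of `unet.sample_size * vae_scale_factor` = 32 * 8 = 256 matches the `sample_size` above.

```python
# vae_scale_factor as diffusers derives it from this config.
block_out_channels = [128, 256, 512, 512]
vae_scale_factor = 2 ** (len(block_out_channels) - 1)
print(vae_scale_factor)       # 8
print(32 * vae_scale_factor)  # 256 (default height/width)
```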
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e4c08995484ee61270175e9e7a072b66a6e4eeb5f0c266667fe1f45b90daf9a
+ size 167335342