finally, everything works locally
- README.md +42 -17
- convert_mvdream_to_diffusers.py +2 -2
- mvdream/adaptor.py +0 -28
- mvdream/attention.py +2 -4
- mvdream/models.py +6 -8
- mvdream/pipeline_mvdream.py +19 -22
- requirements.lock.txt +6 -0
- requirements.txt +6 -0
- run_imagedream.py +3 -4
- run_mvdream.py +3 -4
README.md
CHANGED

@@ -1,15 +1,27 @@
-# MVDream-
+# MVDream-diffusers
 
-
+A **unified** diffusers implementation of [MVDream](https://github.com/bytedance/MVDream) and [ImageDream](https://github.com/bytedance/ImageDream).
 
-
+We provide converted `fp16` weights on [huggingface](TODO).
+
+### Usage
 
-
+```bash
+python run_mvdream.py "a cute owl"
+python run_imagedream.py data/anya_rgba.png
+```
+
+### Install
 ```bash
 # dependency
-pip install -
+pip install -r requirements.txt
+```
+
+### Convert weights
 
-
+MVDream:
+```bash
+# download original ckpt (we only support the SD 2.1 version)
 cd models
 wget https://huggingface.co/MVDream/MVDream/resolve/main/sd-v2.1-base-4view.pt
 wget https://raw.githubusercontent.com/bytedance/MVDream/main/mvdream/configs/sd-v2-base.yaml

@@ -21,18 +33,31 @@ python convert_mvdream_to_diffusers.py --checkpoint_path models/sd-v2.1-base-4vi
 
 ImageDream:
 ```bash
-# download original ckpt
-
-wget https://
+# download original ckpt (we only support the pixel-controller version)
+cd models
+wget https://huggingface.co/Peng-Wang/ImageDream/resolve/main/sd-v2.1-base-4view-ipmv.pt
+wget https://raw.githubusercontent.com/bytedance/ImageDream/main/extern/ImageDream/imagedream/configs/sd_v2_base_ipmv.yaml
+cd ..
 
 # convert
-python convert_mvdream_to_diffusers.py --checkpoint_path models/sd-v2.1-base-4view-ipmv
+python convert_mvdream_to_diffusers.py --checkpoint_path models/sd-v2.1-base-4view-ipmv.pt --dump_path ./weights_imagedream --original_config_file models/sd_v2_base_ipmv.yaml --half --to_safetensors --test
 ```
 
-###
-
-
-```
-
-
-
+### Acknowledgement
+
+* The original papers:
+```bibtex
+@article{shi2023MVDream,
+  author = {Shi, Yichun and Wang, Peng and Ye, Jianglong and Mai, Long and Li, Kejie and Yang, Xiao},
+  title = {MVDream: Multi-view Diffusion for 3D Generation},
+  journal = {arXiv:2308.16512},
+  year = {2023},
+}
+@article{wang2023imagedream,
+  title={ImageDream: Image-Prompt Multi-view Diffusion for 3D Generation},
+  author={Wang, Peng and Shi, Yichun},
+  journal={arXiv preprint arXiv:2312.02201},
+  year={2023}
+}
+```
+* This codebase is modified from [mvdream-hf](https://github.com/KokeCacao/mvdream-hf).
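For context, here is how the converted weights end up being consumed; this is a minimal sketch assembled from `run_mvdream.py` as changed in this commit (the output filename is illustrative, and the pipeline moves its own modules to the target device inside `__call__`, as shown in the pipeline diff below):

```python
# Sketch only: load the converted fp16 weights and render a 4-view grid, mirroring run_mvdream.py.
import numpy as np
import torch
import kiui
from mvdream.pipeline_mvdream import MVDreamPipeline

pipe = MVDreamPipeline.from_pretrained(
    "./weights_mvdream",                   # local dump from convert_mvdream_to_diffusers.py
    # "ashawkey/mvdream-sd2.1-diffusers",  # or the converted weights on the hub
    torch_dtype=torch.float16,
)

image = pipe("a cute owl")                 # 4 views, indexable as image[0] .. image[3]
grid = np.concatenate(
    [
        np.concatenate([image[0], image[2]], axis=0),
        np.concatenate([image[1], image[3]], axis=0),
    ],
    axis=1,
)
kiui.write_image("owl_views.jpg", grid)    # illustrative filename
```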
convert_mvdream_to_diffusers.py
CHANGED

@@ -568,7 +568,7 @@ if __name__ == "__main__":
    images = pipe(
        image=input_image,
        prompt="",
-        negative_prompt="
+        negative_prompt="",
        output_type="pil",
        guidance_scale=5.0,
        num_inference_steps=50,

@@ -582,7 +582,7 @@ if __name__ == "__main__":
    images = loaded_pipe(
        image=input_image,
        prompt="",
-        negative_prompt="
+        negative_prompt="",
        output_type="pil",
        guidance_scale=5.0,
        num_inference_steps=50,
mvdream/adaptor.py
CHANGED

@@ -73,34 +73,6 @@ class PerceiverAttention(nn.Module):
         return self.to_out(out)
 
 
-class ImageProjModel(torch.nn.Module):
-    """Projection Model"""
-
-    def __init__(
-        self,
-        cross_attention_dim=1024,
-        clip_embeddings_dim=1024,
-        clip_extra_context_tokens=4,
-    ):
-        super().__init__()
-        self.cross_attention_dim = cross_attention_dim
-        self.clip_extra_context_tokens = clip_extra_context_tokens
-
-        # from 1024 -> 4 * 1024
-        self.proj = torch.nn.Linear(
-            clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim
-        )
-        self.norm = torch.nn.LayerNorm(cross_attention_dim)
-
-    def forward(self, image_embeds):
-        embeds = image_embeds
-        clip_extra_context_tokens = self.proj(embeds).reshape(
-            -1, self.clip_extra_context_tokens, self.cross_attention_dim
-        )
-        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
-        return clip_extra_context_tokens
-
-
 class Resampler(nn.Module):
     def __init__(
         self,
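The deleted `ImageProjModel` was a plain linear image-prompt projector that the unified pipeline no longer uses (only `Resampler` remains imported, per the models.py diff below). Shape-wise, the dropped logic amounts to the following condensed sketch, kept here only to document what was removed:

```python
import torch

# Condensed ImageProjModel: one CLIP image embedding -> 4 extra context tokens.
proj = torch.nn.Linear(1024, 4 * 1024)   # clip_embeddings_dim -> clip_extra_context_tokens * cross_attention_dim
norm = torch.nn.LayerNorm(1024)

image_embeds = torch.randn(2, 1024)                       # [B, clip_embeddings_dim]
tokens = norm(proj(image_embeds).reshape(-1, 4, 1024))    # [B, 4, cross_attention_dim]
print(tokens.shape)                                       # torch.Size([2, 4, 1024])
```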
mvdream/attention.py
CHANGED

@@ -88,7 +88,7 @@ class MemoryEfficientCrossAttention(nn.Module):
         context = default(context, x)
 
         if self.ip_dim > 0:
-            # context
+            # context: [B, 77 + 16(ip), 1024]
             token_len = context.shape[1]
             context_ip = context[:, -self.ip_dim :, :]
             k_ip = self.to_k_ip(context_ip)

@@ -212,9 +212,7 @@ class SpatialTransformer3D(nn.Module):
         self.in_channels = in_channels
 
         inner_dim = n_heads * d_head
-        self.norm = nn.GroupNorm(
-            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
-        )
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
         self.proj_in = nn.Linear(in_channels, inner_dim)
 
         self.transformer_blocks = nn.ModuleList(
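The new comment documents the layout the `ip_dim` branch relies on: the image-prompt tokens are appended after the 77 text tokens, so they can be sliced off the end and routed through `to_k_ip`/`to_v_ip`. A tiny shape sketch, assuming `ip_dim = 16` as in the comment (the text-token slice is shown for illustration; only the `context_ip` slice appears in the diff):

```python
import torch

B, ip_dim = 2, 16
context = torch.randn(B, 77 + ip_dim, 1024)          # [B, 77 + 16(ip), 1024], as in the comment

token_len = context.shape[1]                         # 93
context_ip = context[:, -ip_dim:, :]                 # image-prompt tokens -> to_k_ip / to_v_ip
context_txt = context[:, : token_len - ip_dim, :]    # text tokens -> regular to_k / to_v
print(context_txt.shape, context_ip.shape)           # torch.Size([2, 77, 1024]) torch.Size([2, 16, 1024])
```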
mvdream/models.py
CHANGED

@@ -14,7 +14,7 @@ from .util import (
     timestep_embedding,
 )
 from .attention import SpatialTransformer3D
-from .adaptor import Resampler
+from .adaptor import Resampler
 
 import kiui
 

@@ -266,15 +266,13 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
         num_heads_upsample=-1,
         use_scale_shift_norm=False,
         resblock_updown=False,
-        transformer_depth=1,
-        context_dim=None,
-        n_embed=None,
-        disable_self_attentions=None,
+        transformer_depth=1,
+        context_dim=None,
+        n_embed=None,
         num_attention_blocks=None,
-        disable_middle_self_attn=False,
         adm_in_channels=None,
         camera_dim=None,
-        ip_dim=0,
+        ip_dim=0,  # imagedream uses ip_dim > 0
         ip_weight=1.0,
         **kwargs,
     ):

@@ -604,7 +602,7 @@ class MultiViewUNetModel(ModelMixin, ConfigMixin):
 
         # imagedream variant
        if self.ip_dim > 0:
-            x[(num_frames - 1) :: num_frames, :, :, :] = ip_img
+            x[(num_frames - 1) :: num_frames, :, :, :] = ip_img  # place at [4, 9]
            ip_emb = self.image_embed(ip)
            context = torch.cat((context, ip_emb), 1)
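The added `# place at [4, 9]` note refers to the strided indexing: with `num_frames = 5` (4 generated views plus the extra reference view used by ImageDream) and a classifier-free-guidance batch of 10, `(num_frames - 1)::num_frames` selects exactly the last frame of each group. A small index sketch; the batch of 10 and the latent shape are assumptions chosen to match the two CFG halves:

```python
import torch

num_frames = 5                                  # 4 views + 1 extra reference view (imagedream)
x = torch.zeros(2 * num_frames, 4, 32, 32)      # e.g. negative + positive CFG halves
ip_img = torch.ones(2, 4, 32, 32)               # reference-image latents, one per half

idx = list(range(x.shape[0]))[(num_frames - 1)::num_frames]
print(idx)                                      # [4, 9] -- the "place at [4, 9]" in the comment
x[(num_frames - 1)::num_frames] = ip_img        # overwrite the reference-view slots
```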
mvdream/pipeline_mvdream.py
CHANGED

@@ -405,29 +405,27 @@ class MVDreamPipeline(DiffusionPipeline):
     def encode_image(self, image, device, num_images_per_prompt):
         dtype = next(self.image_encoder.parameters()).dtype
 
-
+        if image.dtype == np.float32:
+            image = (image * 255).astype(np.uint8)
+
         image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
         image = image.to(device=device, dtype=dtype)
 
-
-
+        image_embeds = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
 
-
-        uncond_image_enc_hidden_states = torch.zeros_like(image_enc_hidden_states)
-
-        return uncond_image_enc_hidden_states, image_enc_hidden_states
+        return torch.zeros_like(image_embeds), image_embeds
 
     def encode_image_latents(self, image, device, num_images_per_prompt):
 
-        image = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2) # [1, 3, H, W]
-        image = image.to(device=device)
-        image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False)
         dtype = next(self.image_encoder.parameters()).dtype
+
+        image = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2).to(device=device) # [1, 3, H, W]
+        image = 2 * image - 1
+        image = F.interpolate(image, (256, 256), mode='bilinear', align_corners=False)
         image = image.to(dtype=dtype)
 
         posterior = self.vae.encode(image).latent_dist
-
         latents = posterior.sample() * self.vae.config.scaling_factor # [B, C, H, W]
         latents = latents.repeat_interleave(num_images_per_prompt, dim=0)

@@ -436,13 +434,13 @@ class MVDreamPipeline(DiffusionPipeline):
     @torch.no_grad()
     def __call__(
         self,
-        prompt: str = "
+        prompt: str = "",
         image: Optional[np.ndarray] = None,
         height: int = 256,
         width: int = 256,
         num_inference_steps: int = 50,
         guidance_scale: float = 7.0,
-        negative_prompt: str = "
+        negative_prompt: str = "",
         num_images_per_prompt: int = 1,
         eta: float = 0.0,
         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,

@@ -454,7 +452,6 @@ class MVDreamPipeline(DiffusionPipeline):
     ):
         self.unet = self.unet.to(device=device)
         self.vae = self.vae.to(device=device)
-
         self.text_encoder = self.text_encoder.to(device=device)
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)

@@ -466,10 +463,9 @@ class MVDreamPipeline(DiffusionPipeline):
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps = self.scheduler.timesteps
 
-        # imagedream variant
+        # imagedream variant
        if image is not None:
            assert isinstance(image, np.ndarray) and image.dtype == np.float32
-
            self.image_encoder = self.image_encoder.to(device=device)
            image_embeds_neg, image_embeds_pos = self.encode_image(image, device, num_images_per_prompt)
            image_latents_neg, image_latents_pos = self.encode_image_latents(image, device, num_images_per_prompt)

@@ -496,7 +492,11 @@ class MVDreamPipeline(DiffusionPipeline):
            None,
        )
 
-
+        if image is not None:
+            camera = get_camera(num_frames, elevation=5, extra_view=True).to(dtype=latents.dtype, device=device)
+        else:
+            camera = get_camera(num_frames, elevation=15, extra_view=False).to(dtype=latents.dtype, device=device)
+        camera = camera.repeat_interleave(num_images_per_prompt, dim=0)
 
        # Prepare extra step kwargs.
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

@@ -508,10 +508,7 @@ class MVDreamPipeline(DiffusionPipeline):
            # expand the latents if we are doing classifier free guidance
            multiplier = 2 if do_classifier_free_guidance else 1
            latent_model_input = torch.cat([latents] * multiplier)
-            latent_model_input = self.scheduler.scale_model_input(
-                latent_model_input, t
-            )
-
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
            unet_inputs = {
                'x': latent_model_input,
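The rewritten encoders use two different input conventions: CLIP's feature extractor wants `uint8` pixels, while the VAE wants `[-1, 1]` floats at the 256×256 working resolution; the unconditional image embedding for classifier-free guidance is simply zeros. A standalone sketch of just that preprocessing, with dummy tensors and no model calls (the embedding shape at the end is a placeholder, not taken from this diff):

```python
import numpy as np
import torch
import torch.nn.functional as F

image = np.random.rand(320, 320, 3).astype(np.float32)   # float32 RGB in [0, 1], as asserted in __call__

# CLIP branch (encode_image): back to uint8 pixels before the feature extractor.
clip_input = (image * 255).astype(np.uint8)

# VAE branch (encode_image_latents): [-1, 1] floats, resized to 256x256.
vae_input = torch.from_numpy(image).unsqueeze(0).permute(0, 3, 1, 2)  # [1, 3, H, W]
vae_input = 2 * vae_input - 1
vae_input = F.interpolate(vae_input, (256, 256), mode="bilinear", align_corners=False)

# CFG pairing: the "negative" image embedding is all zeros, same shape as the positive one.
image_embeds_pos = torch.randn(1, 257, 1280)              # placeholder shape for hidden_states[-2]
image_embeds_neg = torch.zeros_like(image_embeds_pos)
```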
requirements.lock.txt
ADDED

@@ -0,0 +1,6 @@
+omegaconf == 2.3.0
+diffusers == 0.23.1
+safetensors == 0.4.1
+huggingface_hub == 0.19.4
+transformers == 4.35.2
+accelerate == 0.25.0.dev0
requirements.txt
ADDED

@@ -0,0 +1,6 @@
+omegaconf
+diffusers
+safetensors
+huggingface_hub
+transformers
+accelerate
run_imagedream.py
CHANGED

@@ -17,9 +17,9 @@ parser.add_argument("image", type=str, default='data/anya_rgba.png')
 parser.add_argument("--prompt", type=str, default="")
 args = parser.parse_args()
 
-while True:
+for i in range(5):
     input_image = kiui.read_image(args.image, mode='float')
-    image = pipe(args.prompt, input_image)
+    image = pipe(args.prompt, input_image, guidance_scale=5)
     grid = np.concatenate(
         [
             np.concatenate([image[0], image[2]], axis=0),

@@ -28,5 +28,4 @@ while True:
         axis=1,
     )
     # kiui.vis.plot_image(grid)
-    kiui.write_image('
-    break
+    kiui.write_image(f'test_imagedream_{i}.jpg', grid)
run_mvdream.py
CHANGED

@@ -5,7 +5,7 @@ import argparse
 from mvdream.pipeline_mvdream import MVDreamPipeline
 
 pipe = MVDreamPipeline.from_pretrained(
-    "./
+    "./weights_mvdream", # local weights
     # "ashawkey/mvdream-sd2.1-diffusers",
     torch_dtype=torch.float16
 )

@@ -16,7 +16,7 @@ parser = argparse.ArgumentParser(description="MVDream")
 parser.add_argument("prompt", type=str, default="a cute owl 3d model")
 args = parser.parse_args()
 
-while True:
+for i in range(5):
     image = pipe(args.prompt)
     grid = np.concatenate(
         [

@@ -26,5 +26,4 @@ while True:
         axis=1,
     )
     # kiui.vis.plot_image(grid)
-    kiui.write_image('
-    break
+    kiui.write_image(f'test_mvdream_{i}.jpg', grid)