Commit 79cc00b
Parent(s): b530233

update

Changed files:
- hy3dgen/shapegen/__init__.py +1 -1
- hy3dgen/shapegen/models/__init__.py +1 -1
- hy3dgen/shapegen/models/conditioner.py +104 -12
- hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py +12 -3
- hy3dgen/shapegen/pipelines.py +181 -65
- hy3dgen/shapegen/postprocessors.py +4 -1
- hy3dgen/shapegen/preprocessors.py +55 -6
hy3dgen/shapegen/__init__.py
CHANGED
@@ -13,5 +13,5 @@
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
 from .pipelines import Hunyuan3DDiTPipeline, Hunyuan3DDiTFlowMatchingPipeline
-from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover
+from .postprocessors import FaceReducer, FloaterRemover, DegenerateFaceRemover, MeshSimplifier
 from .preprocessors import ImageProcessorV2, IMAGE_PROCESSORS, DEFAULT_IMAGEPROCESSOR
hy3dgen/shapegen/models/__init__.py
CHANGED
@@ -25,4 +25,4 @@
 
 from .autoencoders import ShapeVAE
 from .conditioner import DualImageEncoder, SingleImageEncoder, DinoImageEncoder, CLIPImageEncoder
-from .denoisers import
+from .denoisers import Hunyuan3DDiT
hy3dgen/shapegen/models/conditioner.py
CHANGED
@@ -22,6 +22,7 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import numpy as np
 import torch
 import torch.nn as nn
 from torchvision import transforms
@@ -33,6 +34,26 @@ from transformers import (
 )
 
 
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000 ** omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    return np.concatenate([emb_sin, emb_cos], axis=1)
+
+
 class ImageEncoder(nn.Module):
     def __init__(
         self,
@@ -67,7 +88,7 @@ class ImageEncoder(nn.Module):
             ]
         )
 
-    def forward(self, image, mask=None, value_range=(-1, 1)):
+    def forward(self, image, mask=None, value_range=(-1, 1), **kwargs):
         if value_range is not None:
             low, high = value_range
             image = (image - low) / (high - low)
@@ -82,7 +103,7 @@ class ImageEncoder(nn.Module):
 
         return last_hidden_state
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         device = next(self.model.parameters()).device
         dtype = next(self.model.parameters()).dtype
         zero = torch.zeros(
@@ -110,11 +131,82 @@ class DinoImageEncoder(ImageEncoder):
     std = [0.229, 0.224, 0.225]
 
 
+class DinoImageEncoderMV(DinoImageEncoder):
+    def __init__(
+        self,
+        version=None,
+        config=None,
+        use_cls_token=True,
+        image_size=224,
+        view_num=4,
+        **kwargs,
+    ):
+        super().__init__(version, config, use_cls_token, image_size, **kwargs)
+        self.view_num = view_num
+        self.num_patches = self.num_patches
+        pos = np.arange(self.view_num, dtype=np.float32)
+        view_embedding = torch.from_numpy(
+            get_1d_sincos_pos_embed_from_grid(self.model.config.hidden_size, pos)).float()
+
+        view_embedding = view_embedding.unsqueeze(1).repeat(1, self.num_patches, 1)
+        self.view_embed = view_embedding.unsqueeze(0)
+
+    def forward(self, image, mask=None, value_range=(-1, 1), view_idxs=None):
+        if value_range is not None:
+            low, high = value_range
+            image = (image - low) / (high - low)
+
+        image = image.to(self.model.device, dtype=self.model.dtype)
+
+        bs, num_views, c, h, w = image.shape
+        image = image.view(bs * num_views, c, h, w)
+
+        inputs = self.transform(image)
+        outputs = self.model(inputs)
+
+        last_hidden_state = outputs.last_hidden_state
+        last_hidden_state = last_hidden_state.view(
+            bs, num_views, last_hidden_state.shape[-2],
+            last_hidden_state.shape[-1]
+        )
+
+        view_embedding = self.view_embed.to(last_hidden_state.dtype).to(last_hidden_state.device)
+        if view_idxs is not None:
+            assert len(view_idxs) == bs
+            view_embeddings = []
+            for i in range(bs):
+                view_idx = view_idxs[i]
+                assert num_views == len(view_idx)
+                view_embeddings.append(self.view_embed[:, view_idx, ...])
+            view_embedding = torch.cat(view_embeddings, 0).to(last_hidden_state.dtype).to(last_hidden_state.device)
+
+        if num_views != self.view_num:
+            view_embedding = view_embedding[:, :num_views, ...]
+        last_hidden_state = last_hidden_state + view_embedding
+        last_hidden_state = last_hidden_state.view(bs, num_views * last_hidden_state.shape[-2],
+                                                   last_hidden_state.shape[-1])
+        return last_hidden_state
+
+    def unconditional_embedding(self, batch_size, view_idxs=None, **kwargs):
+        device = next(self.model.parameters()).device
+        dtype = next(self.model.parameters()).dtype
+        zero = torch.zeros(
+            batch_size,
+            self.num_patches * len(view_idxs[0]),
+            self.model.config.hidden_size,
+            device=device,
+            dtype=dtype,
+        )
+        return zero
+
+
 def build_image_encoder(config):
     if config['type'] == 'CLIPImageEncoder':
         return CLIPImageEncoder(**config['kwargs'])
     elif config['type'] == 'DinoImageEncoder':
         return DinoImageEncoder(**config['kwargs'])
+    elif config['type'] == 'DinoImageEncoderMV':
+        return DinoImageEncoderMV(**config['kwargs'])
     else:
         raise ValueError(f'Unknown image encoder type: {config["type"]}')
 
@@ -129,17 +221,17 @@ class DualImageEncoder(nn.Module):
         self.main_image_encoder = build_image_encoder(main_image_encoder)
         self.additional_image_encoder = build_image_encoder(additional_image_encoder)
 
-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
-            'additional': self.additional_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
+            'additional': self.additional_image_encoder(image, mask=mask, **kwargs),
         }
         return outputs
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
-            'additional': self.additional_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
+            'additional': self.additional_image_encoder.unconditional_embedding(batch_size, **kwargs),
         }
         return outputs
 
@@ -152,14 +244,14 @@ class SingleImageEncoder(nn.Module):
         super().__init__()
         self.main_image_encoder = build_image_encoder(main_image_encoder)
 
-    def forward(self, image, mask=None):
+    def forward(self, image, mask=None, **kwargs):
         outputs = {
-            'main': self.main_image_encoder(image, mask=mask),
+            'main': self.main_image_encoder(image, mask=mask, **kwargs),
        }
         return outputs
 
-    def unconditional_embedding(self, batch_size):
+    def unconditional_embedding(self, batch_size, **kwargs):
         outputs = {
-            'main': self.main_image_encoder.unconditional_embedding(batch_size),
+            'main': self.main_image_encoder.unconditional_embedding(batch_size, **kwargs),
         }
         return outputs
hy3dgen/shapegen/models/denoisers/hunyuan3ddit.py
CHANGED
@@ -60,6 +60,15 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
     return embedding
 
 
+class GELU(nn.Module):
+    def __init__(self, approximate='tanh'):
+        super().__init__()
+        self.approximate = approximate
+
+    def forward(self, x: Tensor) -> Tensor:
+        return nn.functional.gelu(x.contiguous(), approximate=self.approximate)
+
+
 class MLPEmbedder(nn.Module):
     def __init__(self, in_dim: int, hidden_dim: int):
         super().__init__()
@@ -162,7 +171,7 @@ class DoubleStreamBlock(nn.Module):
         self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.img_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            nn.GELU(approximate="tanh"),
+            GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -173,7 +182,7 @@ class DoubleStreamBlock(nn.Module):
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
             nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
-            nn.GELU(approximate="tanh"),
+            GELU(approximate="tanh"),
             nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
 
@@ -239,7 +248,7 @@ class SingleStreamBlock(nn.Module):
         self.hidden_size = hidden_size
         self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
 
-        self.mlp_act = nn.GELU(approximate="tanh")
+        self.mlp_act = GELU(approximate="tanh")
         self.modulation = Modulation(hidden_size, double=False)
 
     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
hy3dgen/shapegen/pipelines.py
CHANGED
@@ -24,11 +24,12 @@ import trimesh
 import yaml
 from PIL import Image
 from diffusers.utils.torch_utils import randn_tensor
+from diffusers.utils.import_utils import is_accelerate_version, is_accelerate_available
 from tqdm import tqdm
 
 from .models.autoencoders import ShapeVAE
 from .models.autoencoders import SurfaceExtractors
-from .utils import logger, synchronize_timer
+from .utils import logger, synchronize_timer, smart_load_model
 
 
 def retrieve_timesteps(
@@ -127,6 +128,9 @@ def instantiate_from_config(config, **kwargs):
 
 
 class Hunyuan3DDiTPipeline:
+    model_cpu_offload_seq = "conditioner->model->vae"
+    _exclude_from_cpu_offload = []
+
     @classmethod
     @synchronize_timer('Hunyuan3DDiTPipeline Model Loading')
     def from_single_file(
@@ -207,34 +211,12 @@ class Hunyuan3DDiTPipeline:
             dtype=dtype,
             device=device,
         )
-
-
-
-
-
-
-            logger.info('Model path not exists, try to download from huggingface')
-            try:
-                import huggingface_hub
-                # download from huggingface
-                path = huggingface_hub.snapshot_download(repo_id=original_model_path)
-                model_path = os.path.join(path, subfolder)
-            except ImportError:
-                logger.warning(
-                    "You need to install HuggingFace Hub to load models from the hub."
-                )
-                raise RuntimeError(f"Model path {model_path} not found")
-            except Exception as e:
-                raise e
-
-        if not os.path.exists(model_path):
-            raise FileNotFoundError(f"Model path {original_model_path} not found")
-
-        extension = 'ckpt' if not use_safetensors else 'safetensors'
-        variant = '' if variant is None else f'.{variant}'
-        ckpt_name = f'model{variant}.{extension}'
-        config_path = os.path.join(model_path, 'config.yaml')
-        ckpt_path = os.path.join(model_path, ckpt_name)
+        config_path, ckpt_path = smart_load_model(
+            model_path,
+            subfolder=subfolder,
+            use_safetensors=use_safetensors,
+            variant=variant
+        )
         return cls.from_single_file(
             ckpt_path,
             config_path,
@@ -279,12 +261,18 @@ class Hunyuan3DDiTPipeline:
         if enabled:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             turbo_vae_mapping = {
-                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0-turbo',
-                'Hunyuan3D-
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0-turbo'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini-turbo'),
             }
             model_name = model_path.split('/')[-1]
             if replace_vae and model_name in turbo_vae_mapping:
-
+                model_path, subfolder = turbo_vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(
+                    model_path, subfolder=subfolder,
+                    use_safetensors=self.kwargs['from_pretrained_kwargs']['use_safetensors'],
+                    device=self.device,
+                )
                 self.vae.enable_flashvdm_decoder(
                     enabled=enabled,
                     adaptive_kv_selection=adaptive_kv_selection,
@@ -294,33 +282,146 @@
         else:
             model_path = self.kwargs['from_pretrained_kwargs']['model_path']
             vae_mapping = {
-                'Hunyuan3D-2': 'hunyuan3d-vae-v2-0',
-                'Hunyuan3D-
+                'Hunyuan3D-2': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mv': ('tencent/Hunyuan3D-2', 'hunyuan3d-vae-v2-0'),
+                'Hunyuan3D-2mini': ('tencent/Hunyuan3D-2mini', 'hunyuan3d-vae-v2-mini'),
             }
             model_name = model_path.split('/')[-1]
             if model_name in vae_mapping:
-
+                model_path, subfolder = vae_mapping[model_name]
+                self.vae = ShapeVAE.from_pretrained(model_path, subfolder=subfolder)
             self.vae.enable_flashvdm_decoder(enabled=False)
 
     def to(self, device=None, dtype=None):
-        if device is not None:
-            self.device = torch.device(device)
-            self.vae.to(device)
-            self.model.to(device)
-            self.conditioner.to(device)
         if dtype is not None:
             self.dtype = dtype
             self.vae.to(dtype=dtype)
             self.model.to(dtype=dtype)
             self.conditioner.to(dtype=dtype)
+        if device is not None:
+            self.device = torch.device(device)
+            self.vae.to(device)
+            self.model.to(device)
+            self.conditioner.to(device)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
+        Accelerate's module hooks.
+        """
+        for name, model in self.components.items():
+            if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload:
+                continue
+
+            if not hasattr(model, "_hf_hook"):
+                return self.device
+            for module in model.modules():
+                if (
+                    hasattr(module, "_hf_hook")
+                    and hasattr(module._hf_hook, "execution_device")
+                    and module._hf_hook.execution_device is not None
+                ):
+                    return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
+        """
+        if self.model_cpu_offload_seq is None:
+            raise ValueError(
+                "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set."
+            )
+
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0)
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu")
+            device_mod = getattr(torch, self.device.type, None)
+            if hasattr(device_mod, "empty_cache") and device_mod.is_available():
+                device_mod.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)}
+
+        self._all_hooks = []
+        hook = None
+        for model_str in self.model_cpu_offload_seq.split("->"):
+            model = all_model_components.pop(model_str, None)
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
+            self._all_hooks.append(hook)
+
+        # CPU offload models that are not in the seq chain unless they are explicitly excluded
+        # these models will stay on CPU until maybe_free_model_hooks is called
+        # some models cannot be in the seq chain because they are iteratively called, such as controlnet
+        for name, model in all_model_components.items():
+            if not isinstance(model, torch.nn.Module):
+                continue
+
+            if name in self._exclude_from_cpu_offload:
+                model.to(device)
+            else:
+                _, hook = cpu_offload_with_hook(model, device)
+                self._all_hooks.append(hook)
+
+    def maybe_free_model_hooks(self):
+        r"""
+        Function that offloads all components, removes all model hooks that were added when using
+        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
+        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
+        functions correctly when applying enable_model_cpu_offload.
+        """
+        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
+            # `enable_model_cpu_offload` has not be called, so silently do nothing
+            return
+
+        for hook in self._all_hooks:
+            # offload model and remove hook from model
+            hook.offload()
+            hook.remove()
+
+        # make sure the model is in the same state as before calling it
+        self.enable_model_cpu_offload()
 
     @synchronize_timer('Encode cond')
-    def encode_cond(self, image,
+    def encode_cond(self, image, additional_cond_inputs, do_classifier_free_guidance, dual_guidance):
         bsz = image.shape[0]
-        cond = self.conditioner(image=image,
+        cond = self.conditioner(image=image, **additional_cond_inputs)
 
         if do_classifier_free_guidance:
-            un_cond = self.conditioner.unconditional_embedding(bsz)
+            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
 
         if dual_guidance:
             un_cond_drop_main = copy.deepcopy(un_cond)
@@ -336,8 +437,6 @@ class Hunyuan3DDiTPipeline:
 
             cond = cat_recursive(cond, un_cond_drop_main, un_cond)
         else:
-            un_cond = self.conditioner.unconditional_embedding(bsz, **additional_cond_inputs)
-
             def cat_recursive(a, b):
                 if isinstance(a, torch.Tensor):
                     return torch.cat([a, b], dim=0).to(self.dtype)
@@ -383,25 +482,27 @@ class Hunyuan3DDiTPipeline:
         latents = latents * getattr(self.scheduler, 'init_noise_sigma', 1.0)
         return latents
 
-    def prepare_image(self, image):
+    def prepare_image(self, image) -> dict:
         if isinstance(image, str) and not os.path.exists(image):
             raise FileNotFoundError(f"Couldn't find image at path {image}")
 
         if not isinstance(image, list):
             image = [image]
-
-
+
+        outputs = []
         for img in image:
-
-
-            mask_pts.append(mask_pt)
+            output = self.image_processor(img)
+            outputs.append(output)
 
-
-
-
-
-
-
+        cond_input = {k: [] for k in outputs[0].keys()}
+        for output in outputs:
+            for key, value in output.items():
+                cond_input[key].append(value)
+        for key, value in cond_input.items():
+            if isinstance(value[0], torch.Tensor):
+                cond_input[key] = torch.cat(value, dim=0)
+
+        return cond_input
 
     def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
         """
@@ -474,10 +575,14 @@ class Hunyuan3DDiTPipeline:
             getattr(self.model, 'guidance_cond_proj_dim', None) is None
         dual_guidance = dual_guidance_scale >= 0 and dual_guidance
 
-
-
-
-
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
+        cond = self.encode_cond(
+            image=image,
+            additional_cond_inputs=cond_inputs,
+            do_classifier_free_guidance=do_classifier_free_guidance,
+            dual_guidance=False,
+        )
         batch_size = image.shape[0]
 
         t_dtype = torch.long
@@ -535,7 +640,17 @@ class Hunyuan3DDiTPipeline:
             box_v, mc_level, num_chunks, octree_resolution, mc_algo,
         )
 
-    def _export(
+    def _export(
+        self,
+        latents,
+        output_type='trimesh',
+        box_v=1.01,
+        mc_level=0.0,
+        num_chunks=20000,
+        octree_resolution=256,
+        mc_algo='mc',
+        enable_pbar=True
+    ):
         if not output_type == "latent":
             latents = 1. / self.vae.scale_factor * latents
             latents = self.vae(latents)
@@ -562,7 +677,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
     @torch.inference_mode()
     def __call__(
         self,
-        image: Union[str, List[str], Image.Image] = None,
+        image: Union[str, List[str], Image.Image, dict, List[dict]] = None,
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         sigmas: List[float] = None,
@@ -590,10 +705,11 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
             self.model.guidance_embed is True
         )
 
-
+        cond_inputs = self.prepare_image(image)
+        image = cond_inputs.pop('image')
         cond = self.encode_cond(
             image=image,
-
+            additional_cond_inputs=cond_inputs,
             do_classifier_free_guidance=do_classifier_free_guidance,
             dual_guidance=False,
         )
hy3dgen/shapegen/postprocessors.py
CHANGED
@@ -12,13 +12,16 @@
 # fine-tuning enabling code and other elements of the foregoing made publicly available
 # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
 
+import os
 import tempfile
 from typing import Union
 
+import numpy as np
 import pymeshlab
+import torch
 import trimesh
 
-from .models.
+from .models.autoencoders import Latent2MeshOutput
 from .utils import synchronize_timer
 
 
hy3dgen/shapegen/preprocessors.py
CHANGED
@@ -87,9 +87,7 @@ class ImageProcessorV2:
         mask = mask.clip(0, 255).astype(np.uint8)
         return result, mask
 
-    def
-        if self.border_ratio is not None:
-            border_ratio = self.border_ratio
+    def load_image(self, image, border_ratio=0.15, to_tensor=True):
         if isinstance(image, str):
             image = cv2.imread(image, cv2.IMREAD_UNCHANGED)
         image, mask = self.recenter(image, border_ratio=border_ratio)
@@ -106,13 +104,64 @@ class ImageProcessorV2:
         if to_tensor:
             image = array_to_tensor(image)
             mask = array_to_tensor(mask)
-
-
-
+        return image, mask
+
+    def __call__(self, image, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+        image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+        outputs = {
+            'image': image,
+            'mask': mask
+        }
+        return outputs
+
+
+class MVImageProcessorV2(ImageProcessorV2):
+    """
+    view order: front, front clockwise 90, back, front clockwise 270
+    """
+    return_view_idx = True
+
+    def __init__(self, size=512, border_ratio=None):
+        super().__init__(size, border_ratio)
+        self.view2idx = {
+            'front': 0,
+            'left': 1,
+            'back': 2,
+            'right': 3
+        }
+
+    def __call__(self, image_dict, border_ratio=0.15, to_tensor=True, **kwargs):
+        if self.border_ratio is not None:
+            border_ratio = self.border_ratio
+
+        images = []
+        masks = []
+        view_idxs = []
+        for idx, (view_tag, image) in enumerate(image_dict.items()):
+            view_idxs.append(self.view2idx[view_tag])
+            image, mask = self.load_image(image, border_ratio=border_ratio, to_tensor=to_tensor)
+            images.append(image)
+            masks.append(mask)
+
+        zipped_lists = zip(view_idxs, images, masks)
+        sorted_zipped_lists = sorted(zipped_lists)
+        view_idxs, images, masks = zip(*sorted_zipped_lists)
+
+        image = torch.cat(images, 0).unsqueeze(0)
+        mask = torch.cat(masks, 0).unsqueeze(0)
+        outputs = {
+            'image': image,
+            'mask': mask,
+            'view_idxs': view_idxs
+        }
+        return outputs
 
 
 IMAGE_PROCESSORS = {
     "v2": ImageProcessorV2,
+    'mv_v2': MVImageProcessorV2,
 }
 
 DEFAULT_IMAGEPROCESSOR = 'v2'