Spaces:

stepfun-ai
/

Step1X-3D

Running on Zero

Step1X-3D / step1x3d_geometry /models /autoencoders /volume_decoders.py

2ac1c2d 2 months ago

12.1 kB

	# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
	# except for the third-party components listed below.
	# Hunyuan 3D does not impose any additional limitations beyond what is outlined
	# in the repsective licenses of these third-party components.
	# Users must comply with all terms and conditions of original licenses of these third-party
	# components and must ensure that the usage of the third party components adheres to
	# all relevant laws and regulations.

	# For avoidance of doubts, Hunyuan 3D means the large language models and
	# their software and algorithms, including trained model weights, parameters (including
	# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
	# fine-tuning enabling code and other elements of the foregoing made publicly available
	# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.

	from typing import Union, Tuple, List, Callable

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from einops import repeat
	from tqdm import tqdm

	cube_corners = torch.tensor(
	[
	[0, 0, 0],
	[1, 0, 0],
	[0, 1, 0],
	[1, 1, 0],
	[0, 0, 1],
	[1, 0, 1],
	[0, 1, 1],
	[1, 1, 1],
	],
	dtype=torch.int,
	)


	def extract_near_surface_volume_fn(input_tensor: torch.Tensor, alpha: float):
	device = input_tensor.device
	D = input_tensor.shape[0]
	signed_val = 0.0

	# 添加偏移并处理无效值
	val = input_tensor + alpha
	valid_mask = val > -9000 # 假设-9000是无效值

	# 改进的邻居获取函数（保持维度一致）
	def get_neighbor(t, shift, axis):
	"""根据指定轴进行位移并保持维度一致"""
	if shift == 0:
	return t.clone()

	# 确定填充轴（输入为[D, D, D]对应z,y,x轴）
	pad_dims = [0, 0, 0, 0, 0, 0] # 格式：[x前，x后，y前，y后，z前，z后]

	# 根据轴类型设置填充
	if axis == 0: # x轴（最后一个维度）
	pad_idx = 0 if shift > 0 else 1
	pad_dims[pad_idx] = abs(shift)
	elif axis == 1: # y轴（中间维度）
	pad_idx = 2 if shift > 0 else 3
	pad_dims[pad_idx] = abs(shift)
	elif axis == 2: # z轴（第一个维度）
	pad_idx = 4 if shift > 0 else 5
	pad_dims[pad_idx] = abs(shift)

	# 执行填充（添加batch和channel维度适配F.pad）
	padded = F.pad(
	t.unsqueeze(0).unsqueeze(0), pad_dims[::-1], mode="replicate"
	) # 反转顺序适配F.pad

	# 构建动态切片索引
	slice_dims = [slice(None)] * 3 # 初始化为全切片
	if axis == 0: # x轴（dim=2）
	if shift > 0:
	slice_dims[0] = slice(shift, None)
	else:
	slice_dims[0] = slice(None, shift)
	elif axis == 1: # y轴（dim=1）
	if shift > 0:
	slice_dims[1] = slice(shift, None)
	else:
	slice_dims[1] = slice(None, shift)
	elif axis == 2: # z轴（dim=0）
	if shift > 0:
	slice_dims[2] = slice(shift, None)
	else:
	slice_dims[2] = slice(None, shift)

	# 应用切片并恢复维度
	padded = padded.squeeze(0).squeeze(0)
	sliced = padded[slice_dims]
	return sliced

	# 获取各方向邻居（确保维度一致）
	left = get_neighbor(val, 1, axis=0) # x方向
	right = get_neighbor(val, -1, axis=0)
	back = get_neighbor(val, 1, axis=1) # y方向
	front = get_neighbor(val, -1, axis=1)
	down = get_neighbor(val, 1, axis=2) # z方向
	up = get_neighbor(val, -1, axis=2)

	# 处理边界无效值（使用where保持维度一致）
	def safe_where(neighbor):
	return torch.where(neighbor > -9000, neighbor, val)

	left = safe_where(left)
	right = safe_where(right)
	back = safe_where(back)
	front = safe_where(front)
	down = safe_where(down)
	up = safe_where(up)

	# 计算符号一致性（转换为float32确保精度）
	sign = torch.sign(val.to(torch.float32))
	neighbors_sign = torch.stack(
	[
	torch.sign(left.to(torch.float32)),
	torch.sign(right.to(torch.float32)),
	torch.sign(back.to(torch.float32)),
	torch.sign(front.to(torch.float32)),
	torch.sign(down.to(torch.float32)),
	torch.sign(up.to(torch.float32)),
	],
	dim=0,
	)

	# 检查所有符号是否一致
	same_sign = torch.all(neighbors_sign == sign, dim=0)

	# 生成最终掩码
	mask = (~same_sign).to(torch.int32)
	return mask * valid_mask.to(torch.int32)


	def generate_dense_grid_points(
	bbox_min: np.ndarray,
	bbox_max: np.ndarray,
	octree_resolution: int,
	indexing: str = "ij",
	):
	length = bbox_max - bbox_min
	num_cells = octree_resolution

	x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
	y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
	z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
	[xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
	xyz = np.stack((xs, ys, zs), axis=-1)
	grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]

	return xyz, grid_size, length


	class VanillaVolumeDecoder:
	@torch.no_grad()
	def __call__(
	self,
	latents: torch.FloatTensor,
	geo_decoder: Callable,
	bounds: Union[Tuple[float], List[float], float] = 1.01,
	num_chunks: int = 10000,
	octree_resolution: int = 384,
	enable_pbar: bool = True,
	**kwargs,
	):
	device = latents.device
	dtype = latents.dtype
	batch_size = latents.shape[0]

	# 1. generate query points
	if isinstance(bounds, float):
	bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]

	bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
	xyz_samples, grid_size, length = generate_dense_grid_points(
	bbox_min=bbox_min,
	bbox_max=bbox_max,
	octree_resolution=octree_resolution,
	indexing="ij",
	)
	xyz_samples = (
	torch.from_numpy(xyz_samples)
	.to(device, dtype=dtype)
	.contiguous()
	.reshape(-1, 3)
	)

	# 2. latents to 3d volume
	batch_features = []
	for start in tqdm(
	range(0, xyz_samples.shape[0], num_chunks),
	desc=f"Volume Decoding",
	disable=not enable_pbar,
	):
	chunk_queries = xyz_samples[start : start + num_chunks, :]
	chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size)
	features = geo_decoder(queries=chunk_queries, latents=latents)
	batch_features.append(features)

	grid_features = torch.cat(batch_features, dim=1)
	grid_logits, grid_features = grid_features[..., 0:1], grid_features[..., 1:]
	grid_logits = grid_logits.view((batch_size, *grid_size)).float()

	return grid_logits, xyz_samples, grid_features, None


	class HierarchicalVolumeDecoder:
	@torch.no_grad()
	def __call__(
	self,
	latents: torch.FloatTensor,
	geo_decoder: Callable,
	bounds: Union[Tuple[float], List[float], float] = 1.01,
	num_chunks: int = 65536,
	mc_level: float = 0.0,
	octree_resolution: int = 384,
	min_resolution: int = 63,
	enable_pbar: bool = True,
	empty_value: float = float("nan"),
	**kwargs,
	):
	device = latents.device
	dtype = latents.dtype

	resolutions = []
	if octree_resolution < min_resolution:
	resolutions.append(octree_resolution)
	while octree_resolution >= min_resolution:
	resolutions.append(octree_resolution)
	octree_resolution = octree_resolution // 2
	resolutions.reverse()

	# 1. generate query points
	if isinstance(bounds, float):
	bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
	bbox_min = np.array(bounds[0:3])
	bbox_max = np.array(bounds[3:6])
	bbox_size = bbox_max - bbox_min

	xyz_samples, grid_size, length = generate_dense_grid_points(
	bbox_min=bbox_min,
	bbox_max=bbox_max,
	octree_resolution=resolutions[0],
	indexing="ij",
	)

	dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype)
	dilate.weight = torch.nn.Parameter(
	torch.ones(dilate.weight.shape, dtype=dtype, device=device)
	)

	grid_size = np.array(grid_size)
	xyz_samples = (
	torch.from_numpy(xyz_samples)
	.to(device, dtype=dtype)
	.contiguous()
	.reshape(-1, 3)
	)

	# 2. latents to 3d volume
	batch_features = []
	batch_size = latents.shape[0]
	for start in tqdm(
	range(0, xyz_samples.shape[0], num_chunks),
	desc=f"Hierarchical Volume Decoding [r{resolutions[0] + 1}]",
	disable=not enable_pbar,
	):
	queries = xyz_samples[start : start + num_chunks, :]
	batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
	features = geo_decoder(queries=batch_queries, latents=latents)
	batch_features.append(features)

	grid_features = torch.cat(batch_features, dim=1).view(
	(batch_size, grid_size[0], grid_size[1], grid_size[2], -1)
	)
	grid_logits = grid_features[..., 0] # assume the first element is the logits

	for octree_depth_now in resolutions[1:]:
	grid_size = np.array([octree_depth_now + 1] * 3)
	resolution = bbox_size / octree_depth_now
	next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device)
	next_logits = torch.full(
	next_index.shape, -10000.0, dtype=dtype, device=device
	)
	curr_points = extract_near_surface_volume_fn(
	grid_logits.squeeze(0), mc_level
	)
	curr_points += grid_logits.squeeze(0).abs() < 0.95

	if octree_depth_now == resolutions[-1]:
	expand_num = 0
	else:
	expand_num = 1
	for i in range(expand_num):
	curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0)
	(cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0)
	next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1
	for i in range(2 - expand_num):
	next_index = dilate(next_index.unsqueeze(0)).squeeze(0)
	nidx = torch.where(next_index > 0)

	next_points = torch.stack(nidx, dim=1)
	next_points = next_points * torch.tensor(
	resolution, dtype=latents.dtype, device=device
	) + torch.tensor(bbox_min, dtype=latents.dtype, device=device)

	batch_features = []
	for start in tqdm(
	range(0, next_points.shape[0], num_chunks),
	desc=f"Hierarchical Volume Decoding [r{octree_depth_now + 1}]",
	disable=not enable_pbar,
	):
	queries = next_points[start : start + num_chunks, :]
	batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
	features = geo_decoder(
	queries=batch_queries.to(latents.dtype), latents=latents
	)
	batch_features.append(features)
	grid_features = torch.cat(batch_features, dim=1)
	grid_logits = grid_features[..., 0:1]
	next_logits[nidx] = grid_logits[0, ..., 0]
	grid_logits = next_logits.unsqueeze(0)
	grid_logits[grid_logits == -10000.0] = empty_value

	return grid_logits