"""
decoder_gs.py: Structured Latent Gaussian Decoder for 3D Representation Learning
This file contains decoder implementations that transform latent codes into 3D Gaussian
representations. The decoders use sparse transformer architectures for efficient processing
and flexible attention mechanisms. The main components are:
- SLatGaussianDecoder: Core decoder that maps latent codes to 3D Gaussians
- ElasticSLatGaussianDecoder: Memory-efficient variant with elastic memory management
"""
from typing import *
import torch
import torch.nn as nn
import torch.nn.functional as F
from ...modules import sparse as sp
from ...utils.random_utils import hammersley_sequence
from .base import SparseTransformerBase
from ...representations import Gaussian
from ..sparse_elastic_mixin import SparseTransformerElasticMixin
class SLatGaussianDecoder(SparseTransformerBase):
    """
    Sparse-transformer decoder that converts structured latent codes into 3D
    Gaussian representations.

    Each active voxel of the input sparse tensor is decoded into
    ``rep_config['num_gaussians']`` Gaussian primitives — position offset,
    DC color feature, per-axis scaling, rotation quaternion and opacity —
    concatenated along the channel axis in the order computed by
    :meth:`_calc_layout`.
    """
    def __init__(
        self,
        resolution: int,                        # Side length of the cubic 3D voxel grid
        model_channels: int,                    # Hidden width of the transformer layers
        latent_channels: int,                   # Number of channels in the input latent code
        num_blocks: int,                        # Number of transformer blocks
        num_heads: Optional[int] = None,        # Number of attention heads (takes precedence over num_head_channels)
        num_head_channels: Optional[int] = 64,  # Channels per head, used when num_heads is None
        mlp_ratio: float = 4,                   # MLP hidden size relative to model_channels
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "swin",  # Attention mechanism
        window_size: int = 8,                   # Window size for the windowed attention modes
        pe_mode: Literal["ape", "rope"] = "ape",  # Positional encoding: absolute or rotary
        use_fp16: bool = False,                 # Whether to run the transformer in half precision
        use_checkpoint: bool = False,           # Whether to use gradient checkpointing
        qk_rms_norm: bool = False,              # Whether to RMS-normalize queries/keys in attention
        representation_config: Optional[dict] = None,  # Hyper-parameters of the Gaussian representation
    ):
        super().__init__(
            in_channels=latent_channels,
            model_channels=model_channels,
            num_blocks=num_blocks,
            num_heads=num_heads,
            num_head_channels=num_head_channels,
            mlp_ratio=mlp_ratio,
            attn_mode=attn_mode,
            window_size=window_size,
            pe_mode=pe_mode,
            use_fp16=use_fp16,
            use_checkpoint=use_checkpoint,
            qk_rms_norm=qk_rms_norm,
        )
        self.resolution = resolution
        # NOTE(review): rep_config is indexed without defaults below; it is
        # assumed to provide 'num_gaussians', 'voxel_size', 'lr',
        # 'perturb_offset', '3d_filter_kernel_size', 'scaling_bias',
        # 'opacity_bias' and 'scaling_activation' — confirm against callers.
        self.rep_config = representation_config
        self._calc_layout()  # Compute per-parameter channel ranges and total out_channels
        self.out_layer = sp.SparseLinear(model_channels, self.out_channels)  # Final projection to Gaussian parameters
        self._build_perturbation()  # Precompute the fixed per-Gaussian position perturbation buffer
        self.initialize_weights()
        if use_fp16:
            self.convert_to_fp16()

    def initialize_weights(self) -> None:
        """
        Initialize model weights.

        Runs the base-class initialization, then zero-initializes the output
        projection so the decoder starts by predicting all-zero residuals.
        """
        super().initialize_weights()
        # Zero-out output layers:
        nn.init.constant_(self.out_layer.weight, 0)
        nn.init.constant_(self.out_layer.bias, 0)

    def _build_perturbation(self) -> None:
        """
        Build the fixed position perturbation added to predicted offsets.

        Uses a 3D Hammersley sequence for a quasi-uniform spread of
        ``num_gaussians`` points, rescales by the voxel size, and maps the
        result through atanh so that the later ``tanh`` in
        :meth:`to_representation` recovers the intended offsets.
        """
        perturbation = [hammersley_sequence(3, i, self.rep_config['num_gaussians']) for i in range(self.rep_config['num_gaussians'])]
        perturbation = torch.tensor(perturbation).float() * 2 - 1  # Hammersley points lie in [0, 1]^3; map to [-1, 1]^3
        perturbation = perturbation / self.rep_config['voxel_size']
        # NOTE(review): atanh is finite only for inputs strictly inside (-1, 1),
        # so this implicitly assumes voxel_size > 1 — confirm configuration.
        perturbation = torch.atanh(perturbation).to(self.device)
        self.register_buffer('offset_perturbation', perturbation)  # Buffer: moved/saved with the module, not trained

    def _calc_layout(self) -> None:
        """
        Calculate the channel layout of the output tensor.

        For each Gaussian parameter group, records its per-voxel tensor shape
        and flat size, then assigns each group a contiguous ``range`` of
        channels. Sets ``self.out_channels`` to the total channel count.
        """
        self.layout = {
            '_xyz' : {'shape': (self.rep_config['num_gaussians'], 3), 'size': self.rep_config['num_gaussians'] * 3},
            '_features_dc' : {'shape': (self.rep_config['num_gaussians'], 1, 3), 'size': self.rep_config['num_gaussians'] * 3},
            '_scaling' : {'shape': (self.rep_config['num_gaussians'], 3), 'size': self.rep_config['num_gaussians'] * 3},
            '_rotation' : {'shape': (self.rep_config['num_gaussians'], 4), 'size': self.rep_config['num_gaussians'] * 4},
            '_opacity' : {'shape': (self.rep_config['num_gaussians'], 1), 'size': self.rep_config['num_gaussians']},
        }
        # Assign each parameter group a contiguous slice of the flattened output channels
        start = 0
        for k, v in self.layout.items():
            v['range'] = (start, start + v['size'])
            start += v['size']
        self.out_channels = start  # Total number of output channels

    def to_representation(self, x: sp.SparseTensor) -> List[Gaussian]:
        """
        Convert a batch of network outputs to 3D Gaussian representations.

        Args:
            x: The [N x * x C] sparse tensor output by the network.

        Returns:
            List of Gaussian representations, one per batch item.
        """
        ret = []
        for i in range(x.shape[0]):
            # Build the representation container; sh_degree=0 keeps only the DC color term.
            # The aabb is presumably [origin_xyz, extent_xyz] — confirm against Gaussian.
            representation = Gaussian(
                sh_degree=0,
                aabb=[-0.5, -0.5, -0.5, 1.0, 1.0, 1.0],
                mininum_kernel_size = self.rep_config['3d_filter_kernel_size'],  # (sic) keyword name defined by Gaussian
                scaling_bias = self.rep_config['scaling_bias'],
                opacity_bias = self.rep_config['opacity_bias'],
                scaling_activation = self.rep_config['scaling_activation']
            )
            # Voxel-center positions in [0, 1], from this item's sparse coords (column 0 is the batch index)
            xyz = (x.coords[x.layout[i]][:, 1:].float() + 0.5) / self.resolution
            # Slice this item's features per parameter group using the precomputed channel ranges
            for k, v in self.layout.items():
                if k == '_xyz':
                    # Positions: predicted offsets around each voxel center
                    offset = x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape'])
                    offset = offset * self.rep_config['lr'][k]  # Per-parameter learning-rate scale
                    if self.rep_config['perturb_offset']:
                        offset = offset + self.offset_perturbation  # Add the fixed quasi-random perturbation
                    # tanh bounds the offset; scale so it spans half a voxel times voxel_size
                    offset = torch.tanh(offset) / self.resolution * 0.5 * self.rep_config['voxel_size']
                    _xyz = xyz.unsqueeze(1) + offset
                    setattr(representation, k, _xyz.flatten(0, 1))
                else:
                    # Other parameters (features, scaling, rotation, opacity): slice, reshape, flatten per Gaussian
                    feats = x.feats[x.layout[i]][:, v['range'][0]:v['range'][1]].reshape(-1, *v['shape']).flatten(0, 1)
                    feats = feats * self.rep_config['lr'][k]  # Per-parameter learning-rate scale
                    setattr(representation, k, feats)
            ret.append(representation)
        return ret

    def forward(self, x: sp.SparseTensor) -> List[Gaussian]:
        """
        Forward pass through the decoder.

        Args:
            x: Input sparse tensor containing latent codes.

        Returns:
            List of Gaussian representations ready for rendering.
        """
        h = super().forward(x)  # Process through transformer blocks
        h = h.type(x.dtype)  # Cast back to the input dtype (relevant under fp16)
        h = h.replace(F.layer_norm(h.feats, h.feats.shape[-1:]))  # Normalize features before projection
        h = self.out_layer(h)  # Project to the Gaussian parameter channels
        return self.to_representation(h)  # Convert to Gaussian representations
class ElasticSLatGaussianDecoder(SparseTransformerElasticMixin, SLatGaussianDecoder):
    """
    Slat VAE Gaussian decoder with elastic memory management.

    Used for training with low VRAM by dynamically managing memory
    allocations and using efficient sparse operations.

    BUG FIX: the second base class must be the concrete
    ``SLatGaussianDecoder``, not the bare ``SparseTransformerBase``.
    The elastic mixin only layers memory management over its sibling base;
    inheriting from ``SparseTransformerBase`` directly would drop the entire
    Gaussian head (``out_layer``, ``_calc_layout``, ``to_representation`` and
    the converting ``forward``), so the class would no longer decode
    Gaussians at all.
    """
    pass