Spaces:

omnipart
/

OmniPart

Running on Zero

File size: 7,373 Bytes

491eded

"""
Base Sparse Transformer Implementation for TRELLIS Framework

This file implements the base architecture for sparse transformers used in structured latent variable models.
It provides a configurable foundation with multiple attention mechanisms (full, windowed, shifted window)
and supports different positional encoding strategies. The sparse implementation allows for efficient
processing of data with varying density patterns.

The main class SparseTransformerBase serves as the foundation for encoder and decoder implementations
in the structured latent VAE models.
"""

from typing import *
import torch
import torch.nn as nn
from ...modules.utils import convert_module_to_f16, convert_module_to_f32
from ...modules import sparse as sp
from ...modules.transformer import AbsolutePositionEmbedder
from ...modules.sparse.transformer import SparseTransformerBlock


def block_attn_config(self):
    """
    Yield the attention configuration for each transformer block.

    One 5-tuple is produced per block (``self.num_blocks`` in total), laid out as
    ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``.
    The contents depend on ``self.attn_mode``:

    - shift_window: "serialized" attention, no sequence shift, window shift
      alternating between (0, 0, 0) and (16, 16, 16), Z-order serialization
    - shift_sequence: "serialized" attention, sequence shift alternating between
      0 and ``window_size // 2``, zero window shift, Z-order serialization
    - shift_order: "serialized" attention, no shifts, serialization order cycling
      through ``sp.SerializeModes[i % 4]``
    - full: "full" attention, every other field is None
    - swin: "windowed" attention, scalar window shift alternating between 0 and
      ``window_size // 2``, remaining fields None

    Args:
        self: Any object exposing ``num_blocks``, ``attn_mode`` and ``window_size``.

    Yields:
        Tuple ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``.

    Raises:
        ValueError: If ``self.attn_mode`` is not one of the supported modes.
    """
    mode = self.attn_mode
    known_modes = ("shift_window", "shift_sequence", "shift_order", "full", "swin")
    # Fail loudly: an unrecognised mode used to match no branch, so the generator
    # produced nothing and the caller ended up with zero transformer blocks.
    if mode not in known_modes:
        raise ValueError(
            f"Unknown attn_mode {mode!r}; expected one of {known_modes}"
        )

    for i in range(self.num_blocks):
        parity = i % 2  # 0 on even blocks, 1 on odd blocks: toggles the shift
        if mode == "shift_window":
            yield "serialized", self.window_size, 0, (16 * parity,) * 3, sp.SerializeMode.Z_ORDER
        elif mode == "shift_sequence":
            yield "serialized", self.window_size, self.window_size // 2 * parity, (0, 0, 0), sp.SerializeMode.Z_ORDER
        elif mode == "shift_order":
            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[i % 4]
        elif mode == "full":
            yield "full", None, None, None, None
        else:  # "swin"
            yield "windowed", self.window_size, None, self.window_size // 2 * parity, None


class SparseTransformerBase(nn.Module):
    """
    Sparse transformer trunk without any output layers.

    Intended as the shared base for encoder and decoder models: it projects the
    input features to the model width, optionally adds absolute positional
    embeddings, and runs the result through a stack of SparseTransformerBlock
    modules whose attention settings come from block_attn_config.

    Note that the constructor neither converts weights to half precision nor
    calls initialize_weights; ``use_fp16`` only selects the dtype that
    activations are cast to inside forward.
    """
    def __init__(
        self,
        in_channels: int,
        model_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
    ):
        """
        Build the input projection, optional position embedder and block stack.

        Args:
            in_channels: Number of input feature channels
            model_channels: Width of the transformer (hidden dimension)
            num_blocks: How many transformer blocks to stack
            num_heads: Attention head count; when falsy it is derived as
                model_channels // num_head_channels
            num_head_channels: Channels per head, used only to derive num_heads
            mlp_ratio: MLP ratio forwarded to every block
            attn_mode: Attention scheme, expanded per block by block_attn_config
            window_size: Attention window size for the windowed/serialized modes
            pe_mode: "ape" adds an AbsolutePositionEmbedder output in forward;
                "rope" enables use_rope on the blocks instead
            use_fp16: Cast activations to float16 (otherwise float32) in forward
            use_checkpoint: Forwarded to every block as use_checkpoint
            qk_rms_norm: Forwarded to every block as qk_rms_norm
        """
        super().__init__()

        # Hyper-parameters kept on the instance (block_attn_config reads
        # num_blocks, attn_mode and window_size from here).
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.num_blocks = num_blocks
        self.window_size = window_size
        if num_heads:
            self.num_heads = num_heads
        else:
            self.num_heads = model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.attn_mode = attn_mode
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.qk_rms_norm = qk_rms_norm
        if use_fp16:
            self.dtype = torch.float16
        else:
            self.dtype = torch.float32

        # The absolute position embedder only exists in "ape" mode.
        if pe_mode == "ape":
            self.pos_embedder = AbsolutePositionEmbedder(model_channels)

        # Projection from input channels to model width.
        self.input_layer = sp.SparseLinear(in_channels, model_channels)

        # Settings that are identical for every block.
        shared_block_kwargs = dict(
            num_heads=self.num_heads,
            mlp_ratio=self.mlp_ratio,
            use_checkpoint=self.use_checkpoint,
            use_rope=(pe_mode == "rope"),
            qk_rms_norm=self.qk_rms_norm,
        )
        # One block per configuration tuple; the per-block window size comes
        # from the configuration, not directly from the constructor argument.
        block_stack = [
            SparseTransformerBlock(
                model_channels,
                attn_mode=blk_mode,
                window_size=blk_window,
                shift_sequence=blk_seq_shift,
                shift_window=blk_win_shift,
                serialize_mode=blk_serialize,
                **shared_block_kwargs,
            )
            for blk_mode, blk_window, blk_seq_shift, blk_win_shift, blk_serialize
            in block_attn_config(self)
        ]
        self.blocks = nn.ModuleList(block_stack)

    @property
    def device(self) -> torch.device:
        """
        Device of the model, taken from its first parameter.
        """
        first_param = next(self.parameters())
        return first_param.device

    def convert_to_fp16(self) -> None:
        """
        Apply convert_module_to_f16 to the transformer blocks.
        Only ``self.blocks`` is touched; the input layer and position embedder are not.
        """
        self.blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self) -> None:
        """
        Apply convert_module_to_f32 to the transformer blocks.
        Only ``self.blocks`` is touched; the input layer and position embedder are not.
        """
        self.blocks.apply(convert_module_to_f32)

    def initialize_weights(self) -> None:
        """
        Re-initialize every nn.Linear in the model: Xavier-uniform weights
        and, where a bias exists, a zero bias.
        """
        def _xavier_linear(layer):
            # Anything that is not a linear layer is left as it is.
            if not isinstance(layer, nn.Linear):
                return
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

        self.apply(_xavier_linear)

    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """
        Run the input through the projection and all transformer blocks.

        Args:
            x: Input sparse tensor

        Returns:
            Sparse tensor produced by the last transformer block
        """
        # Lift the input features to model width.
        feats = self.input_layer(x)

        # Absolute positional embedding, computed from all coordinate columns
        # except the first (presumably the batch index -- confirm against sp.SparseTensor).
        if self.pe_mode == "ape":
            positions = x.coords[:, 1:]
            feats = feats + self.pos_embedder(positions)

        # Cast activations to the configured precision before the blocks.
        feats = feats.type(self.dtype)

        # Apply the blocks in order.
        for layer in self.blocks:
            feats = layer(feats)

        return feats