# NOTE: repository-page artifact ("omnipart / init / 491eded") removed — it was
# scraped web-page text, not Python source, and broke the module syntax.
"""
Base Sparse Transformer Implementation for TRELLIS Framework
This file implements the base architecture for sparse transformers used in structured latent variable models.
It provides a configurable foundation with multiple attention mechanisms (full, windowed, shifted window)
and supports different positional encoding strategies. The sparse implementation allows for efficient
processing of data with varying density patterns.
The main class SparseTransformerBase serves as the foundation for encoder and decoder implementations
in the structured latent VAE models.
"""
from typing import *
import torch
import torch.nn as nn
from ...modules.utils import convert_module_to_f16, convert_module_to_f32
from ...modules import sparse as sp
from ...modules.transformer import AbsolutePositionEmbedder
from ...modules.sparse.transformer import SparseTransformerBlock
def block_attn_config(self):
    """
    Yield one attention configuration per transformer block.

    For each of the ``self.num_blocks`` blocks, produces a 5-tuple
    ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``
    according to ``self.attn_mode``:

    - "full":           dense full attention, no windowing parameters
    - "swin":           Swin-style windowed attention, alternating window shifts
    - "shift_window":   serialized attention, 3D window shifted every other block
    - "shift_sequence": serialized attention, sequence shifted every other block
    - "shift_order":    serialized attention, cycling through serialization orders

    Yields:
        Tuple describing the attention mode and its parameters for one block.
    """
    mode = self.attn_mode
    for idx in range(self.num_blocks):
        if mode == "full":
            yield "full", None, None, None, None
        elif mode == "swin":
            yield "windowed", self.window_size, None, self.window_size // 2 * (idx % 2), None
        elif mode == "shift_window":
            yield "serialized", self.window_size, 0, (16 * (idx % 2),) * 3, sp.SerializeMode.Z_ORDER
        elif mode == "shift_sequence":
            yield "serialized", self.window_size, self.window_size // 2 * (idx % 2), (0, 0, 0), sp.SerializeMode.Z_ORDER
        elif mode == "shift_order":
            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[idx % 4]
class SparseTransformerBase(nn.Module):
    """
    Sparse Transformer backbone without output layers.

    Shared base class for the encoder and decoder variants: it projects the
    input sparse tensor to the model width, optionally adds absolute positional
    embeddings, and applies a stack of SparseTransformerBlock modules whose
    per-block attention layout is produced by ``block_attn_config``.
    """
    def __init__(
        self,
        in_channels: int,
        model_channels: int,
        num_blocks: int,
        num_heads: Optional[int] = None,
        num_head_channels: Optional[int] = 64,
        mlp_ratio: float = 4.0,
        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
        window_size: Optional[int] = None,
        pe_mode: Literal["ape", "rope"] = "ape",
        use_fp16: bool = False,
        use_checkpoint: bool = False,
        qk_rms_norm: bool = False,
    ):
        """
        Build the sparse transformer torso.

        Args:
            in_channels: Number of channels of the input sparse tensor.
            model_channels: Hidden width of the transformer.
            num_blocks: Number of transformer blocks in the stack.
            num_heads: Attention head count; when None it is derived as
                ``model_channels // num_head_channels``.
            num_head_channels: Channels per head (only used when ``num_heads``
                is None).
            mlp_ratio: Expansion ratio of each block's MLP hidden layer.
            attn_mode: Attention layout used by the blocks.
            window_size: Window size for the windowed/serialized attention modes.
            pe_mode: "ape" for absolute position embeddings, "rope" for rotary.
            use_fp16: Run the torso in float16.
            use_checkpoint: Enable gradient checkpointing inside the blocks.
            qk_rms_norm: Apply RMS normalization to queries and keys.
        """
        super().__init__()
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.num_blocks = num_blocks
        self.window_size = window_size
        # When num_heads is not given (or falsy), derive it from the per-head width.
        self.num_heads = num_heads or model_channels // num_head_channels
        self.mlp_ratio = mlp_ratio
        self.attn_mode = attn_mode
        self.pe_mode = pe_mode
        self.use_fp16 = use_fp16
        self.use_checkpoint = use_checkpoint
        self.qk_rms_norm = qk_rms_norm
        self.dtype = torch.float16 if use_fp16 else torch.float32
        # The absolute positional embedder exists only in "ape" mode; for
        # "rope", rotary embeddings are handled inside the attention blocks.
        if pe_mode == "ape":
            self.pos_embedder = AbsolutePositionEmbedder(model_channels)
        # Project raw input features up to the model width.
        self.input_layer = sp.SparseLinear(in_channels, model_channels)
        # One block per configuration tuple; block_attn_config reads the
        # attributes assigned above, so it must be called after them.
        stack = []
        for blk_mode, blk_window, blk_shift_seq, blk_shift_win, blk_serialize in block_attn_config(self):
            stack.append(SparseTransformerBlock(
                model_channels,
                num_heads=self.num_heads,
                mlp_ratio=self.mlp_ratio,
                attn_mode=blk_mode,
                window_size=blk_window,
                shift_sequence=blk_shift_seq,
                shift_window=blk_shift_win,
                serialize_mode=blk_serialize,
                use_checkpoint=self.use_checkpoint,
                use_rope=(pe_mode == "rope"),
                qk_rms_norm=self.qk_rms_norm,
            ))
        self.blocks = nn.ModuleList(stack)
    @property
    def device(self) -> torch.device:
        """Device of the model's (first) parameter."""
        first_param = next(self.parameters())
        return first_param.device
    def convert_to_fp16(self) -> None:
        """Cast the transformer torso (the blocks) to float16 for mixed precision."""
        self.blocks.apply(convert_module_to_f16)
    def convert_to_fp32(self) -> None:
        """Cast the transformer torso (the blocks) back to float32."""
        self.blocks.apply(convert_module_to_f32)
    def initialize_weights(self) -> None:
        """Xavier-uniform initialization for every linear layer; biases set to zero."""
        def _init_linear(module):
            # Only nn.Linear layers are re-initialized; everything else keeps
            # its module-default initialization.
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_init_linear)
    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
        """
        Run the input through projection, optional positional encoding,
        precision cast, and the transformer stack.

        Args:
            x: Input sparse tensor.

        Returns:
            Sparse tensor of hidden features after all blocks.
        """
        feats = self.input_layer(x)
        if self.pe_mode == "ape":
            # Column 0 of coords is skipped (presumably the batch index —
            # confirm against SparseTensor); the remaining coordinates feed
            # the absolute position embedder.
            feats = feats + self.pos_embedder(x.coords[:, 1:])
        # Match the torso's precision (float16 when use_fp16 is set).
        feats = feats.type(self.dtype)
        for blk in self.blocks:
            feats = blk(feats)
        return feats