File size: 23,642 Bytes

6da2a44

import torch
import torch.nn as nn
from models.encoder import SparseConvNeXtLayerNorm, _get_active_ex_or_ii
from typing import Optional, Sequence, Tuple, Union, List
import numpy as np
from models.mamba.bi_vision_mamba import Mamba
from monai.networks.blocks.unetr_block import UnetrUpBlock

def build_3d_sincos_position_embedding(grid_size, embed_dim, num_tokens=0, temperature=10000.):
    """Build a frozen 3D sine-cosine position embedding.

    Args:
        grid_size: number of tokens per axis. Either a single int (cubic grid,
            the original behaviour) or a sequence ``(h, w, d)``.
        embed_dim: embedding width; must be divisible by 6 (one sin and one cos
            group for each of the three axes).
        num_tokens: 0, or 1 to prepend an all-zero embedding for one extra token.
        temperature: base of the geometric frequency progression.

    Returns:
        ``nn.Parameter`` of shape ``(1, h * w * d + num_tokens, embed_dim)`` with
        ``requires_grad=False``. Tokens are ordered like ``tensor.flatten()`` of
        an ``(h, w, d)`` volume (last axis varies fastest).
    """
    assert embed_dim % 6 == 0, 'Embed dimension must be divisible by 6 for 3D sin-cos position embedding'
    assert num_tokens == 1 or num_tokens == 0, "Number of tokens must be of 0 or 1"

    if isinstance(grid_size, (tuple, list)):
        h, w, d = grid_size
    else:
        h = w = d = grid_size

    axes = [torch.arange(n, dtype=torch.float32) for n in (h, w, d)]
    # 'ij' is what the implicit default produced; spelling it out silences the
    # deprecation warning and pins the token ordering.
    grids = torch.meshgrid(*axes, indexing='ij')

    pos_dim = embed_dim // 6
    omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim
    omega = 1. / (temperature ** omega)

    # Per axis: outer product position x frequency, then a sin and a cos group.
    parts = []
    for grid in grids:
        angles = torch.einsum('m,d->md', grid.flatten(), omega)
        parts.append(torch.sin(angles))
        parts.append(torch.cos(angles))
    pos_emb = torch.cat(parts, dim=1)[None, :, :]

    if num_tokens == 1:
        pe_token = torch.zeros([1, 1, embed_dim], dtype=torch.float32)
        pos_emb = torch.cat([pe_token, pos_emb], dim=1)

    return nn.Parameter(pos_emb, requires_grad=False)


class MlpChannel(nn.Module):
    """Two-layer perceptron applied on the last dimension: Linear -> GELU -> Linear.

    ``hidden_size`` is the input/output width and ``mlp_dim`` the width of the
    intermediate layer.
    """

    def __init__(self, hidden_size, mlp_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=mlp_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(in_features=mlp_dim, out_features=hidden_size)

    def forward(self, x):
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)


class MambaLayer(nn.Module):
    """Pre-norm residual block: a ``Mamba`` mixer followed by a channel MLP.

    Both sub-layers are wrapped as ``sublayer(LayerNorm(x)) + x``. The MLP's
    hidden width is ``2 * dim``.
    """

    def __init__(self, dim, d_state=16, d_conv=4, expand=2):
        super().__init__()
        self.dim = dim
        mamba_kwargs = dict(
            d_model=dim,        # model dimension
            d_state=d_state,    # SSM state expansion factor
            d_conv=d_conv,      # local convolution width
            expand=expand,      # block expansion factor
            bimamba_type="v1",
        )
        self.norm1 = nn.LayerNorm(dim)
        self.mamba = Mamba(**mamba_kwargs)
        self.mlp = MlpChannel(hidden_size=dim, mlp_dim=2 * dim)
        self.norm2 = nn.LayerNorm(dim)

    def forward(self, x):
        mixed = self.mamba(self.norm1(x)) + x
        return self.mlp(self.norm2(mixed)) + mixed


class MaskedAutoencoderMamba(nn.Module):
    """Mamba token encoder over a flattened 3D feature map, with optional masking.

    The ``(B, C, H, W, D)`` input is flattened to ``(B, H*W*D, C)`` tokens, a
    frozen sin-cos position embedding is added, and the tokens run through
    ``depth`` ``MambaLayer`` blocks. When ``sparse`` is true and an active mask
    is supplied, only active tokens are processed; the inactive positions are
    filled with a learnable mask token afterwards.

    Args:
        img_size: input volume edge length (cubic volumes).
        downsample_rato: total spatial reduction before this module;
            ``img_size // downsample_rato`` is the token grid edge.
        embed_dim: token width; must equal the channel count ``C`` of the input.
        depth: number of ``MambaLayer`` blocks.
        norm_layer: accepted for interface compatibility; not used here.
        sparse: enable the masked path (and create ``mask_token``).
    """

    def __init__(self, img_size=96, downsample_rato=16, embed_dim=384, depth=8, norm_layer=nn.LayerNorm, sparse=True):
        super().__init__()
        print("mamba sparse: ", sparse)
        self.grid_size = img_size // downsample_rato
        self.num_patches = (self.grid_size) ** 3
        self.embed_dim = embed_dim
        # Fixed sin-cos embedding; filled in by initialize_weights().
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches, embed_dim),
                                      requires_grad=False)

        self.blocks = nn.ModuleList([MambaLayer(dim=embed_dim) for _ in range(depth)])

        self.sparse = sparse
        if self.sparse:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.initialize_weights()

    def initialize_weights(self):
        """Fill the frozen position embedding and initialise all sub-modules."""
        pos_embed = build_3d_sincos_position_embedding(self.grid_size, self.embed_dim)
        self.pos_embed.data.copy_(pos_embed)
        if self.sparse:
            torch.nn.init.normal_(self.mask_token, std=.02)
        # Visits every sub-module, including Linear/LayerNorm layers nested in the blocks.
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Xavier-uniform for ``nn.Linear``; unit weight / zero bias for ``nn.LayerNorm``."""
        if isinstance(m, nn.Linear):
            # xavier_uniform following the official JAX ViT
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            # Guard against LayerNorm(elementwise_affine=False), which has no parameters.
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            if m.weight is not None:
                nn.init.constant_(m.weight, 1.0)

    def random_masking(self, enc, active_b1fff):
        """Select the active tokens of every sample.

        Despite the name nothing is random here: the selection is fully
        determined by ``active_b1fff``.

        Args:
            enc: token sequence of shape ``[N, L, D]``.
            active_b1fff: activity mask (non-zero = keep) that flattens to
                ``[N, 1, L]`` -- presumably ``(B, 1, f, f, f)`` as the name
                suggests. Every sample must have the same number of active
                tokens so the kept tokens form a rectangular batch.

        Returns:
            ``(x_masked, mask, ids_restore)``: the kept tokens ``[N, K, D]`` in
            their original order, the int mask ``[N, L, 1]`` (1 = kept,
            0 = masked out) and the ``[N, L, 1]`` indices that undo the shuffle.

        Raises:
            ValueError: if the samples have different numbers of active tokens.
        """
        N, L, D = enc.shape  # batch, length, dim
        mask = torch.as_tensor(active_b1fff, dtype=torch.int, device=enc.device)
        mask = mask.flatten(2).transpose(1, 2)  # [N, L, 1]

        # Number of kept tokens is a per-sample quantity. Summing over the whole
        # batch (as before) keeps too many tokens as soon as N > 1.
        kept_per_sample = mask.sum(dim=1).flatten()
        len_keep = int(kept_per_sample[0]) if kept_per_sample.numel() > 0 else 0
        if not bool((kept_per_sample == len_keep).all()):
            raise ValueError(
                "random_masking requires the same number of active tokens in every sample, "
                f"got {kept_per_sample.tolist()}")

        # Stable sort of (1 - mask): active tokens first, each group in its
        # original sequence order, so the result does not depend on tie-breaking.
        ids_shuffle = torch.sort(1 - mask, dim=1, stable=True).indices
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(enc, dim=1, index=ids_keep.repeat(1, 1, D))
        return x_masked, mask, ids_restore

    def unmasking(self, x, ids_restore):
        """Append mask tokens for the dropped positions and restore the original order."""
        mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] - x.shape[1], 1)
        x_ = torch.cat([x, mask_tokens], dim=1)  # no cls token
        x = torch.gather(x_, dim=1, index=ids_restore.repeat(1, 1, x.shape[2]))  # unshuffle
        return x

    def forward_encoder(self, enc, active_b1fff=None):
        """Encode a ``(B, C, H, W, D)`` feature map and return the same shape.

        The masked path is taken only when the module is sparse AND a mask is
        given; with ``active_b1fff=None`` every token is treated as active.
        """
        B, C, H, W, D = enc.shape
        x = enc.flatten(2).transpose(1, 2)
        # add pos embed w/o cls token
        x = x + self.pos_embed

        use_mask = self.sparse and active_b1fff is not None
        if use_mask:
            x, _, ids_restore = self.random_masking(x, active_b1fff)
        for blk in self.blocks:
            x = blk(x)
        if use_mask:
            x = self.unmasking(x, ids_restore)

        return x.transpose(1, 2).reshape(B, C, H, W, D)

    def forward(self, imgs, active_b1fff=None):
        return self.forward_encoder(imgs, active_b1fff)


class MedNeXtBlock(nn.Module):
    """MedNeXt-style block: k x k x k grouped conv -> norm -> 1x1x1 expand -> GELU -> 1x1x1 compress.

    Args:
        in_channels: channels of the input (and of the first convolution's output).
        out_channels: channels produced by the compression convolution.
        exp_r: expansion ratio of the hidden 1x1x1 layer (``exp_r * in_channels``).
        kernel_size: kernel of the first convolution; padding is ``kernel_size // 2``.
        do_res: add the input to the output. The addition requires the output
            to have the same shape as the input.
        n_groups: groups of the first convolution; ``None`` means depth-wise
            (``groups=in_channels``).
        sparse: forwarded to ``SparseConvNeXtLayerNorm``; when true the output
            is additionally multiplied by the mask from ``_get_active_ex_or_ii``.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 exp_r: int = 4,
                 kernel_size: int = 7,
                 do_res: bool = True,
                 n_groups: Optional[int] = None,
                 sparse=False):
        super().__init__()

        self.do_res = do_res
        self.sparse = sparse

        # First convolution: depth-wise unless n_groups is given.
        self.conv1 = nn.Conv3d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=kernel_size // 2,
            groups=in_channels if n_groups is None else n_groups,
        )

        # Normalisation: channels-first SparseConvNeXtLayerNorm (not GroupNorm).
        self.norm = SparseConvNeXtLayerNorm(normalized_shape=in_channels, data_format='channels_first', sparse=sparse)

        # Second convolution (expansion), 1x1x1.
        self.conv2 = nn.Conv3d(
            in_channels=in_channels,
            out_channels=exp_r * in_channels,
            kernel_size=1,
            stride=1,
            padding=0
        )

        self.act = nn.GELU()

        # Third convolution (compression), 1x1x1.
        self.conv3 = nn.Conv3d(
            in_channels=exp_r * in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0
        )

    def forward(self, x, dummy_tensor=None):
        """Apply the block. ``dummy_tensor`` is accepted but unused."""
        x1 = self.conv1(x)
        x1 = self.act(self.conv2(self.norm(x1)))
        x1 = self.conv3(x1)
        if self.sparse:
            # Mask the output at its own resolution.
            x1 *= _get_active_ex_or_ii(H=x1.shape[2], W=x1.shape[3], D=x1.shape[4], returning_active_ex=True)
        if self.do_res:
            x1 = x + x1
        return x1


class MedNeXtDownBlock(MedNeXtBlock):
    """MedNeXt block whose first (depth-wise) convolution uses stride 2.

    The parent's identity residual is always switched off. When ``do_res`` is
    true a strided 1x1x1 convolution of the input is added instead.
    """

    def __init__(self, in_channels, out_channels, exp_r=4, kernel_size=7,
                 do_res=False, sparse=False):
        super().__init__(in_channels, out_channels, exp_r, kernel_size,
                         do_res=False, sparse=sparse)

        self.resample_do_res = do_res
        if self.resample_do_res:
            # Projection shortcut: 1x1x1, stride 2.
            self.res_conv = nn.Conv3d(in_channels, out_channels, kernel_size=1, stride=2)

        # Replaces the stride-1 depth-wise convolution created by the parent.
        self.conv1 = nn.Conv3d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=2,
            padding=kernel_size // 2,
            groups=in_channels,
        )

    def forward(self, x, dummy_tensor=None):
        out = super().forward(x)
        if not self.resample_do_res:
            return out
        return out + self.res_conv(x)


class UnetResBlock(nn.Module):
    """
    Residual block of two 3D convolutions, each followed by ``SparseConvNeXtLayerNorm``.

    Structure: conv -> norm -> LeakyReLU -> conv -> norm, plus a shortcut, then
    LeakyReLU. The shortcut is the identity unless the channel count changes or
    any stride differs from 1; then it is a 1x1x1 convolution followed by a norm.

    Modelled on the DynUNet residual block, see:
    `Automated Design of Deep Learning Methods for Biomedical Image Segmentation <https://arxiv.org/abs/1904.08128>`_.
    `nnU-Net: Self-adapting Framework for U-Net-Based Medical Image Segmentation <https://arxiv.org/abs/1809.10486>`_.

    Args:
        sparse: forwarded to every ``SparseConvNeXtLayerNorm``.
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size: convolution kernel size, an int or one int per spatial axis.
        stride: stride of the first convolution (and of the shortcut convolution).
    """

    def __init__(
            self,
            sparse: bool,
            in_channels: int,
            out_channels: int,
            kernel_size: Union[Sequence[int], int],
            stride: Union[Sequence[int], int],
    ):
        super().__init__()
        # ``kernel_size // 2`` only works for ints; the helper also handles sequences.
        padding = self._same_padding(kernel_size)
        self.conv1 = nn.Conv3d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding)
        self.conv2 = nn.Conv3d(
            out_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
        )
        self.lrelu = nn.LeakyReLU(inplace=True, negative_slope=0.01)
        self.norm1 = SparseConvNeXtLayerNorm(normalized_shape=out_channels, data_format='channels_first', sparse=sparse)
        self.norm2 = SparseConvNeXtLayerNorm(normalized_shape=out_channels, data_format='channels_first', sparse=sparse)

        # A projection shortcut is needed whenever the main path changes the shape.
        changes_resolution = not bool(np.all(np.atleast_1d(stride) == 1))
        self.downsample = bool(in_channels != out_channels) or changes_resolution
        if self.downsample:
            self.conv3 = nn.Conv3d(
                in_channels,
                out_channels,
                kernel_size=1,
                stride=stride)
            self.norm3 = SparseConvNeXtLayerNorm(normalized_shape=out_channels, data_format='channels_first', sparse=sparse)

    @staticmethod
    def _same_padding(kernel_size):
        """Return ``k // 2`` for an int kernel, or the per-axis tuple for a sequence."""
        if np.ndim(kernel_size) == 0:
            return int(kernel_size) // 2
        return tuple(int(k) // 2 for k in kernel_size)

    def forward(self, inp):
        residual = inp
        out = self.lrelu(self.norm1(self.conv1(inp)))
        out = self.norm2(self.conv2(out))
        if self.downsample:
            residual = self.norm3(self.conv3(residual))
        out += residual
        return self.lrelu(out)


class MedNeXtUpBlock(MedNeXtBlock):
    """MedNeXt block whose first convolution is a stride-2 transposed convolution.

    The parent's identity residual is always switched off. When ``do_res`` is
    true a strided 1x1x1 transposed convolution of the input is added instead.
    """

    def __init__(self, in_channels, out_channels, exp_r=4, kernel_size=3,
                 do_res=True, sparse=False):
        super().__init__(in_channels, out_channels, exp_r, kernel_size,
                         do_res=False, sparse=sparse)

        self.resample_do_res = do_res

        conv = nn.ConvTranspose3d
        if do_res:
            # Projection shortcut: 1x1x1 transposed convolution, stride 2.
            self.res_conv = conv(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=2
            )

        # Replaces the stride-1 depth-wise convolution created by the parent.
        self.conv1 = conv(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            stride=2,
            padding=kernel_size // 2,
            groups=in_channels,
        )

    def forward(self, x, dummy_tensor=None):
        """Upsample ``x``. ``dummy_tensor`` is accepted but unused."""
        x1 = super().forward(x)
        # Asymmetric one-voxel pad at the start of each spatial axis; necessary
        # to match the target shape.
        x1 = torch.nn.functional.pad(x1, (1, 0, 1, 0, 1, 0))

        # The shortcut exists only when do_res was requested. Padding and adding
        # it outside this guard raised UnboundLocalError for do_res=False.
        if self.resample_do_res:
            res = self.res_conv(x)
            res = torch.nn.functional.pad(res, (1, 0, 1, 0, 1, 0))
            x1 = x1 + res
        return x1


class UnetOutBlock(nn.Module):
    """Output head: a 1x1x1 convolution mapping ``in_channels`` to ``n_classes``."""

    def __init__(self, in_channels: int, n_classes: int):
        super().__init__()
        self.conv = nn.Conv3d(in_channels=in_channels, out_channels=n_classes,
                              kernel_size=1, stride=1, bias=True)

    def forward(self, inp):
        logits = self.conv(inp)
        return logits


class Embeddings(nn.Module):
    """Convolutional feature pyramid producing four skip features and a bottleneck.

    A stem convolution is followed by four stages. Each stage max-pools by 2
    and applies a stack of ``MedNeXtBlock``; between stages a 3x3x3 convolution
    widens the channels. The four widened features are each passed through a
    ``UnetResBlock`` before being returned as skips.

    Index 0 of ``depths``, ``kernels`` and ``exp_r`` is not read.
    """

    def __init__(self,
                 in_channel: int = 3,
                 channels: Tuple = (32, 64, 96, 128, 192),
                 depths: Tuple = (1, 1, 3, 1, 1),
                 kernels: Tuple = (3, 3, 3, 3, 3),
                 exp_r: Tuple = (2, 4, 4, 4, 2),
                 sparse=True):
        super(Embeddings, self).__init__()

        def make_stage(idx):
            # depths[idx] residual MedNeXt blocks at constant width channels[idx].
            return nn.Sequential(*(
                MedNeXtBlock(
                    in_channels=channels[idx],
                    out_channels=channels[idx],
                    exp_r=exp_r[idx],
                    kernel_size=kernels[idx],
                    do_res=True,
                    sparse=sparse,
                )
                for _ in range(depths[idx])))

        def make_expand(idx):
            # 3x3x3 convolution widening channels[idx] -> channels[idx + 1].
            return nn.Conv3d(in_channels=channels[idx], out_channels=channels[idx + 1],
                             kernel_size=3, stride=1, padding=1)

        def make_skip(idx):
            # Shape-preserving residual block applied to a skip feature.
            return UnetResBlock(in_channels=channels[idx], out_channels=channels[idx],
                                kernel_size=3, stride=1, sparse=sparse)

        # Channel count of each of the five returned feature maps.
        self.dim = [channels[i] for i in (1, 2, 3, 4, 4)]
        self.stem = nn.Conv3d(in_channels=in_channel, out_channels=channels[0],
                              kernel_size=3, stride=1, padding=1)

        self.layer2 = make_stage(1)
        self.layer3 = make_stage(2)
        self.layer4 = make_stage(3)
        self.layer5 = make_stage(4)

        self.down = nn.MaxPool3d((2, 2, 2))

        self.expend1 = make_expand(0)
        self.expend2 = make_expand(1)
        self.expend3 = make_expand(2)
        self.expend4 = make_expand(3)

        self.encoder1 = make_skip(1)
        self.encoder2 = make_skip(2)
        self.encoder3 = make_skip(3)
        self.encoder4 = make_skip(4)

    def forward(self, x):
        """Return ``(skip1, skip2, skip3, skip4, bottleneck)`` from fine to coarse."""
        x1 = self.expend1(self.stem(x))
        x2 = self.expend2(self.layer2(self.down(x1)))
        x3 = self.expend3(self.layer3(self.down(x2)))
        x4 = self.expend4(self.layer4(self.down(x3)))
        x5 = self.layer5(self.down(x4))

        skip1 = self.encoder1(x1)
        skip2 = self.encoder2(x2)
        skip3 = self.encoder3(x3)
        skip4 = self.encoder4(x4)
        return skip1, skip2, skip3, skip4, x5


class Encoder(nn.Module):
    """Convolutional ``Embeddings`` pyramid followed by the Mamba bottleneck encoder.

    ``forward`` returns the four skip features unchanged and the bottleneck
    after it has passed through ``MaskedAutoencoderMamba``.
    """

    def __init__(self,
                 in_channel: int = 1,
                 channels=(32, 64, 128, 192, 384),
                 depths=(1, 2, 2, 2, 1),
                 kernels=(3, 3, 3, 3, 3),
                 exp_r=(2, 2, 4, 4, 4),
                 img_size=96,
                 depth=4,
                 norm_layer=nn.LayerNorm,
                 sparse=False):
        super(Encoder, self).__init__()
        # Channel count of each of the five feature maps returned by forward().
        self.dim = [channels[i] for i in (1, 2, 3, 4, 4)]

        embedding_cfg = dict(
            in_channel=in_channel,
            channels=channels,
            depths=depths,
            kernels=kernels,
            exp_r=exp_r,
            sparse=sparse,
        )
        self.embeddings = Embeddings(**embedding_cfg)

        bottleneck_cfg = dict(
            img_size=img_size,
            downsample_rato=self.get_downsample_ratio(),
            embed_dim=channels[-1],
            depth=depth,
            norm_layer=norm_layer,
            sparse=sparse,
        )
        self.mae = MaskedAutoencoderMamba(**bottleneck_cfg)

    def get_downsample_ratio(self) -> int:
        """
        Return the TOTAL downsample ratio of the ConvNet (16 for this network).

        Per the original note, this is meant for `SparseEncoder's __init__`
        (see `pretrain/encoder.py`). E.g., a ResNet-50 would return 32.
        """
        return 16

    def get_feature_map_channels(self) -> List[int]:
        """
        Return the number of channels of each feature map (``self.dim``).

        Per the original note, this is meant for `SparseEncoder's __init__`
        (see `pretrain/encoder.py`). E.g., a ResNet-50 would return
        [256, 512, 1024, 2048].
        """
        return self.dim

    def forward(self, x, active_b1fff=None):
        skip1, skip2, skip3, skip4, bottleneck = self.embeddings(x)
        encoded = self.mae(bottleneck, active_b1fff)
        return skip1, skip2, skip3, skip4, encoded


class Decoder(nn.Module):
    """U-Net style decoder: four ``UnetrUpBlock`` stages, a residual block and a 1x1x1 head.

    Args:
        n_classes: number of output channels of the final convolution.
        channels: channel widths. These must match the encoder that produces the
            skip features, since each up-block is built for a skip of its own
            ``out_channels``. The default now mirrors ``Encoder``'s default; the
            previous ``196`` at index 3 did not match any encoder in this file.
        norm_name: normalisation passed to every ``UnetrUpBlock``.
        res_block: passed to every ``UnetrUpBlock``.
    """

    def __init__(self,
                 n_classes: int = 3,
                 channels: Tuple = (32, 64, 128, 192, 384),
                 norm_name="instance",
                 res_block: bool = True):
        super(Decoder, self).__init__()

        def up_block(in_ch, out_ch):
            # Upsample by 2 (transposed conv, kernel 2) and fuse with the skip.
            return UnetrUpBlock(
                spatial_dims=3,
                in_channels=in_ch,
                out_channels=out_ch,
                kernel_size=3,
                upsample_kernel_size=2,
                norm_name=norm_name,
                res_block=res_block,
            )

        self.decoder5 = up_block(channels[4], channels[4])
        self.decoder4 = up_block(channels[4], channels[3])
        self.decoder3 = up_block(channels[3], channels[2])
        self.decoder2 = up_block(channels[2], channels[1])
        self.decoder1 = UnetResBlock(
            in_channels=channels[1],
            out_channels=channels[0],
            kernel_size=3,
            stride=1,
            sparse=False
        )
        self.out = UnetOutBlock(in_channels=channels[0], n_classes=n_classes)

    def forward(self, x1, x2, x3, x4, x5):
        """Decode from the bottleneck ``x5`` upward, fusing skips ``x4`` .. ``x1``."""
        d4 = self.decoder5(x5, x4)
        d3 = self.decoder4(d4, x3)
        d2 = self.decoder3(d3, x2)
        d1 = self.decoder2(d2, x1)
        d0 = self.decoder1(d1)
        return self.out(d0)


class Hybird(nn.Module):
    """Full dense segmentation network: ``Embeddings`` -> Mamba bottleneck -> ``Decoder``.

    Both the embeddings and the bottleneck are built with ``sparse=False`` and
    the bottleneck is always called without a mask. The bottleneck's
    ``downsample_rato`` is fixed at 16.
    """

    def __init__(self,
                 in_channel: int = 3,
                 n_classes: int = 3,
                 channels: Tuple = (32, 64, 96, 128, 192),
                 depths: Tuple = (1, 1, 3, 3, 1),
                 kernels: Tuple = (3, 3, 3, 3, 3),
                 exp_r: Tuple = (2, 4, 4, 4, 2),
                 img_size=96,
                 depth=3,
                 norm_layer=nn.LayerNorm, ):
        super().__init__()

        embedding_cfg = dict(
            in_channel=in_channel,
            channels=channels,
            depths=depths,
            kernels=kernels,
            exp_r=exp_r,
            sparse=False,
        )
        self.embeddings = Embeddings(**embedding_cfg)

        bottleneck_cfg = dict(
            img_size=img_size,
            downsample_rato=16,
            embed_dim=channels[-1],
            depth=depth,
            norm_layer=norm_layer,
            sparse=False,
        )
        self.mae = MaskedAutoencoderMamba(**bottleneck_cfg)

        self.decoder = Decoder(n_classes=n_classes, channels=channels)

    def forward(self, x):
        skip1, skip2, skip3, skip4, bottleneck = self.embeddings(x)
        bottleneck = self.mae(bottleneck, None)
        return self.decoder(skip1, skip2, skip3, skip4, bottleneck)


def build_hybird(in_channel=1, n_classes=14, img_size=96):
    """Create a ``Hybird`` network with this file's fixed architecture preset.

    Only the input channels, the number of classes and the image size are
    configurable; widths, depths, kernels, expansion ratios and the number of
    Mamba blocks (4) are hard-wired below.
    """
    preset = dict(
        channels=(32, 64, 128, 192, 384),
        depths=(1, 2, 2, 2, 1),
        kernels=(3, 3, 3, 3, 3),
        exp_r=(2, 2, 4, 4, 4),
        depth=4,
    )
    return Hybird(in_channel=in_channel, n_classes=n_classes, img_size=img_size, **preset)


if __name__ == '__main__':
    # Smoke test: push one random single-channel 96^3 volume through the
    # default network and print the output shape.
    dummy_volume = torch.rand((1, 1, 96, 96, 96))
    model = build_hybird()
    prediction = model(dummy_volume)
    print(prediction.shape)