Spaces:
Runtime error
Runtime error
| # Copyright (c) OpenMMLab. All rights reserved. | |
| from typing import Sequence | |
| import torch | |
| import torch.nn as nn | |
| from mmcv.cnn.bricks import DropPath, build_activation_layer, build_norm_layer | |
| from mmengine.model import BaseModule | |
| from mmpretrain.registry import MODELS | |
| from .base_backbone import BaseBackbone | |
| class PatchEmbed(nn.Module): | |
| """Patch Embedding module implemented by a layer of convolution. | |
| Input: tensor in shape [B, C, H, W] | |
| Output: tensor in shape [B, C, H/stride, W/stride] | |
| Args: | |
| patch_size (int): Patch size of the patch embedding. Defaults to 16. | |
| stride (int): Stride of the patch embedding. Defaults to 16. | |
| padding (int): Padding of the patch embedding. Defaults to 0. | |
| in_chans (int): Input channels. Defaults to 3. | |
| embed_dim (int): Output dimension of the patch embedding. | |
| Defaults to 768. | |
| norm_layer (module): Normalization module. Defaults to None (not use). | |
| """ | |
| def __init__(self, | |
| patch_size=16, | |
| stride=16, | |
| padding=0, | |
| in_chans=3, | |
| embed_dim=768, | |
| norm_layer=None): | |
| super().__init__() | |
| self.proj = nn.Conv2d( | |
| in_chans, | |
| embed_dim, | |
| kernel_size=patch_size, | |
| stride=stride, | |
| padding=padding) | |
| self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() | |
| def forward(self, x): | |
| x = self.proj(x) | |
| x = self.norm(x) | |
| return x | |
| class Pooling(nn.Module): | |
| """Pooling module. | |
| Args: | |
| pool_size (int): Pooling size. Defaults to 3. | |
| """ | |
| def __init__(self, pool_size=3): | |
| super().__init__() | |
| self.pool = nn.AvgPool2d( | |
| pool_size, | |
| stride=1, | |
| padding=pool_size // 2, | |
| count_include_pad=False) | |
| def forward(self, x): | |
| return self.pool(x) - x | |
| class Mlp(nn.Module): | |
| """Mlp implemented by with 1*1 convolutions. | |
| Input: Tensor with shape [B, C, H, W]. | |
| Output: Tensor with shape [B, C, H, W]. | |
| Args: | |
| in_features (int): Dimension of input features. | |
| hidden_features (int): Dimension of hidden features. | |
| out_features (int): Dimension of output features. | |
| act_cfg (dict): The config dict for activation between pointwise | |
| convolution. Defaults to ``dict(type='GELU')``. | |
| drop (float): Dropout rate. Defaults to 0.0. | |
| """ | |
| def __init__(self, | |
| in_features, | |
| hidden_features=None, | |
| out_features=None, | |
| act_cfg=dict(type='GELU'), | |
| drop=0.): | |
| super().__init__() | |
| out_features = out_features or in_features | |
| hidden_features = hidden_features or in_features | |
| self.fc1 = nn.Conv2d(in_features, hidden_features, 1) | |
| self.act = build_activation_layer(act_cfg) | |
| self.fc2 = nn.Conv2d(hidden_features, out_features, 1) | |
| self.drop = nn.Dropout(drop) | |
| def forward(self, x): | |
| x = self.fc1(x) | |
| x = self.act(x) | |
| x = self.drop(x) | |
| x = self.fc2(x) | |
| x = self.drop(x) | |
| return x | |
| class PoolFormerBlock(BaseModule): | |
| """PoolFormer Block. | |
| Args: | |
| dim (int): Embedding dim. | |
| pool_size (int): Pooling size. Defaults to 3. | |
| mlp_ratio (float): Mlp expansion ratio. Defaults to 4. | |
| norm_cfg (dict): The config dict for norm layers. | |
| Defaults to ``dict(type='GN', num_groups=1)``. | |
| act_cfg (dict): The config dict for activation between pointwise | |
| convolution. Defaults to ``dict(type='GELU')``. | |
| drop (float): Dropout rate. Defaults to 0. | |
| drop_path (float): Stochastic depth rate. Defaults to 0. | |
| layer_scale_init_value (float): Init value for Layer Scale. | |
| Defaults to 1e-5. | |
| """ | |
| def __init__(self, | |
| dim, | |
| pool_size=3, | |
| mlp_ratio=4., | |
| norm_cfg=dict(type='GN', num_groups=1), | |
| act_cfg=dict(type='GELU'), | |
| drop=0., | |
| drop_path=0., | |
| layer_scale_init_value=1e-5): | |
| super().__init__() | |
| self.norm1 = build_norm_layer(norm_cfg, dim)[1] | |
| self.token_mixer = Pooling(pool_size=pool_size) | |
| self.norm2 = build_norm_layer(norm_cfg, dim)[1] | |
| mlp_hidden_dim = int(dim * mlp_ratio) | |
| self.mlp = Mlp( | |
| in_features=dim, | |
| hidden_features=mlp_hidden_dim, | |
| act_cfg=act_cfg, | |
| drop=drop) | |
| # The following two techniques are useful to train deep PoolFormers. | |
| self.drop_path = DropPath(drop_path) if drop_path > 0. \ | |
| else nn.Identity() | |
| self.layer_scale_1 = nn.Parameter( | |
| layer_scale_init_value * torch.ones((dim)), requires_grad=True) | |
| self.layer_scale_2 = nn.Parameter( | |
| layer_scale_init_value * torch.ones((dim)), requires_grad=True) | |
| def forward(self, x): | |
| x = x + self.drop_path( | |
| self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * | |
| self.token_mixer(self.norm1(x))) | |
| x = x + self.drop_path( | |
| self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * | |
| self.mlp(self.norm2(x))) | |
| return x | |
| def basic_blocks(dim, | |
| index, | |
| layers, | |
| pool_size=3, | |
| mlp_ratio=4., | |
| norm_cfg=dict(type='GN', num_groups=1), | |
| act_cfg=dict(type='GELU'), | |
| drop_rate=.0, | |
| drop_path_rate=0., | |
| layer_scale_init_value=1e-5): | |
| """ | |
| generate PoolFormer blocks for a stage | |
| return: PoolFormer blocks | |
| """ | |
| blocks = [] | |
| for block_idx in range(layers[index]): | |
| block_dpr = drop_path_rate * (block_idx + sum(layers[:index])) / ( | |
| sum(layers) - 1) | |
| blocks.append( | |
| PoolFormerBlock( | |
| dim, | |
| pool_size=pool_size, | |
| mlp_ratio=mlp_ratio, | |
| norm_cfg=norm_cfg, | |
| act_cfg=act_cfg, | |
| drop=drop_rate, | |
| drop_path=block_dpr, | |
| layer_scale_init_value=layer_scale_init_value, | |
| )) | |
| blocks = nn.Sequential(*blocks) | |
| return blocks | |
| class PoolFormer(BaseBackbone): | |
| """PoolFormer. | |
| A PyTorch implementation of PoolFormer introduced by: | |
| `MetaFormer is Actually What You Need for Vision <https://arxiv.org/abs/2111.11418>`_ | |
| Modified from the `official repo | |
| <https://github.com/sail-sg/poolformer/blob/main/models/poolformer.py>`. | |
| Args: | |
| arch (str | dict): The model's architecture. If string, it should be | |
| one of architecture in ``PoolFormer.arch_settings``. And if dict, it | |
| should include the following two keys: | |
| - layers (list[int]): Number of blocks at each stage. | |
| - embed_dims (list[int]): The number of channels at each stage. | |
| - mlp_ratios (list[int]): Expansion ratio of MLPs. | |
| - layer_scale_init_value (float): Init value for Layer Scale. | |
| Defaults to 'S12'. | |
| norm_cfg (dict): The config dict for norm layers. | |
| Defaults to ``dict(type='LN2d', eps=1e-6)``. | |
| act_cfg (dict): The config dict for activation between pointwise | |
| convolution. Defaults to ``dict(type='GELU')``. | |
| in_patch_size (int): The patch size of input image patch embedding. | |
| Defaults to 7. | |
| in_stride (int): The stride of input image patch embedding. | |
| Defaults to 4. | |
| in_pad (int): The padding of input image patch embedding. | |
| Defaults to 2. | |
| down_patch_size (int): The patch size of downsampling patch embedding. | |
| Defaults to 3. | |
| down_stride (int): The stride of downsampling patch embedding. | |
| Defaults to 2. | |
| down_pad (int): The padding of downsampling patch embedding. | |
| Defaults to 1. | |
| drop_rate (float): Dropout rate. Defaults to 0. | |
| drop_path_rate (float): Stochastic depth rate. Defaults to 0. | |
| out_indices (Sequence | int): Output from which network position. | |
| Index 0-6 respectively corresponds to | |
| [stage1, downsampling, stage2, downsampling, stage3, downsampling, stage4] | |
| Defaults to -1, means the last stage. | |
| frozen_stages (int): Stages to be frozen (all param fixed). | |
| Defaults to 0, which means not freezing any parameters. | |
| init_cfg (dict, optional): Initialization config dict | |
| """ # noqa: E501 | |
| # --layers: [x,x,x,x], numbers of layers for the four stages | |
| # --embed_dims, --mlp_ratios: | |
| # embedding dims and mlp ratios for the four stages | |
| # --downsamples: flags to apply downsampling or not in four blocks | |
| arch_settings = { | |
| 's12': { | |
| 'layers': [2, 2, 6, 2], | |
| 'embed_dims': [64, 128, 320, 512], | |
| 'mlp_ratios': [4, 4, 4, 4], | |
| 'layer_scale_init_value': 1e-5, | |
| }, | |
| 's24': { | |
| 'layers': [4, 4, 12, 4], | |
| 'embed_dims': [64, 128, 320, 512], | |
| 'mlp_ratios': [4, 4, 4, 4], | |
| 'layer_scale_init_value': 1e-5, | |
| }, | |
| 's36': { | |
| 'layers': [6, 6, 18, 6], | |
| 'embed_dims': [64, 128, 320, 512], | |
| 'mlp_ratios': [4, 4, 4, 4], | |
| 'layer_scale_init_value': 1e-6, | |
| }, | |
| 'm36': { | |
| 'layers': [6, 6, 18, 6], | |
| 'embed_dims': [96, 192, 384, 768], | |
| 'mlp_ratios': [4, 4, 4, 4], | |
| 'layer_scale_init_value': 1e-6, | |
| }, | |
| 'm48': { | |
| 'layers': [8, 8, 24, 8], | |
| 'embed_dims': [96, 192, 384, 768], | |
| 'mlp_ratios': [4, 4, 4, 4], | |
| 'layer_scale_init_value': 1e-6, | |
| }, | |
| } | |
| def __init__(self, | |
| arch='s12', | |
| pool_size=3, | |
| norm_cfg=dict(type='GN', num_groups=1), | |
| act_cfg=dict(type='GELU'), | |
| in_patch_size=7, | |
| in_stride=4, | |
| in_pad=2, | |
| down_patch_size=3, | |
| down_stride=2, | |
| down_pad=1, | |
| drop_rate=0., | |
| drop_path_rate=0., | |
| out_indices=-1, | |
| frozen_stages=0, | |
| init_cfg=None): | |
| super().__init__(init_cfg=init_cfg) | |
| if isinstance(arch, str): | |
| assert arch in self.arch_settings, \ | |
| f'Unavailable arch, please choose from ' \ | |
| f'({set(self.arch_settings)}) or pass a dict.' | |
| arch = self.arch_settings[arch] | |
| elif isinstance(arch, dict): | |
| assert 'layers' in arch and 'embed_dims' in arch, \ | |
| f'The arch dict must have "layers" and "embed_dims", ' \ | |
| f'but got {list(arch.keys())}.' | |
| layers = arch['layers'] | |
| embed_dims = arch['embed_dims'] | |
| mlp_ratios = arch['mlp_ratios'] \ | |
| if 'mlp_ratios' in arch else [4, 4, 4, 4] | |
| layer_scale_init_value = arch['layer_scale_init_value'] \ | |
| if 'layer_scale_init_value' in arch else 1e-5 | |
| self.patch_embed = PatchEmbed( | |
| patch_size=in_patch_size, | |
| stride=in_stride, | |
| padding=in_pad, | |
| in_chans=3, | |
| embed_dim=embed_dims[0]) | |
| # set the main block in network | |
| network = [] | |
| for i in range(len(layers)): | |
| stage = basic_blocks( | |
| embed_dims[i], | |
| i, | |
| layers, | |
| pool_size=pool_size, | |
| mlp_ratio=mlp_ratios[i], | |
| norm_cfg=norm_cfg, | |
| act_cfg=act_cfg, | |
| drop_rate=drop_rate, | |
| drop_path_rate=drop_path_rate, | |
| layer_scale_init_value=layer_scale_init_value) | |
| network.append(stage) | |
| if i >= len(layers) - 1: | |
| break | |
| if embed_dims[i] != embed_dims[i + 1]: | |
| # downsampling between two stages | |
| network.append( | |
| PatchEmbed( | |
| patch_size=down_patch_size, | |
| stride=down_stride, | |
| padding=down_pad, | |
| in_chans=embed_dims[i], | |
| embed_dim=embed_dims[i + 1])) | |
| self.network = nn.ModuleList(network) | |
| if isinstance(out_indices, int): | |
| out_indices = [out_indices] | |
| assert isinstance(out_indices, Sequence), \ | |
| f'"out_indices" must by a sequence or int, ' \ | |
| f'get {type(out_indices)} instead.' | |
| for i, index in enumerate(out_indices): | |
| if index < 0: | |
| out_indices[i] = 7 + index | |
| assert out_indices[i] >= 0, f'Invalid out_indices {index}' | |
| self.out_indices = out_indices | |
| if self.out_indices: | |
| for i_layer in self.out_indices: | |
| layer = build_norm_layer(norm_cfg, | |
| embed_dims[(i_layer + 1) // 2])[1] | |
| layer_name = f'norm{i_layer}' | |
| self.add_module(layer_name, layer) | |
| self.frozen_stages = frozen_stages | |
| self._freeze_stages() | |
| def forward_embeddings(self, x): | |
| x = self.patch_embed(x) | |
| return x | |
| def forward_tokens(self, x): | |
| outs = [] | |
| for idx, block in enumerate(self.network): | |
| x = block(x) | |
| if idx in self.out_indices: | |
| norm_layer = getattr(self, f'norm{idx}') | |
| x_out = norm_layer(x) | |
| outs.append(x_out) | |
| return tuple(outs) | |
| def forward(self, x): | |
| # input embedding | |
| x = self.forward_embeddings(x) | |
| # through backbone | |
| x = self.forward_tokens(x) | |
| return x | |
| def _freeze_stages(self): | |
| if self.frozen_stages >= 0: | |
| self.patch_embed.eval() | |
| for param in self.patch_embed.parameters(): | |
| param.requires_grad = False | |
| for i in range(self.frozen_stages): | |
| # Include both block and downsample layer. | |
| module = self.network[i] | |
| module.eval() | |
| for param in module.parameters(): | |
| param.requires_grad = False | |
| if i in self.out_indices: | |
| norm_layer = getattr(self, f'norm{i}') | |
| norm_layer.eval() | |
| for param in norm_layer.parameters(): | |
| param.requires_grad = False | |
| def train(self, mode=True): | |
| super(PoolFormer, self).train(mode) | |
| self._freeze_stages() | |