# model_cfg num_things_classes = 100 num_stuff_classes = 50 num_classes = num_things_classes + num_stuff_classes norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoderMask2Former', pretrained=None, backbone=dict( type='XCiT', patch_size=16, embed_dim=384, depth=12, num_heads=8, mlp_ratio=4, qkv_bias=True, use_abs_pos_emb=True, use_rel_pos_bias=False, ), decode_head=dict( type='Mask2FormerHead', in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside # strides=[4, 8, 16, 32], feat_channels=256, out_channels=256, in_index=[0, 1, 2, 3], num_things_classes=num_things_classes, num_stuff_classes=num_stuff_classes, num_queries=100, num_transformer_feat_level=3, pixel_decoder=dict( type='MSDeformAttnPixelDecoder', num_outs=3, norm_cfg=dict(type='GN', num_groups=32), act_cfg=dict(type='ReLU'), encoder=dict( type='DetrTransformerEncoder', num_layers=6, transformerlayers=dict( type='BaseTransformerLayer', attn_cfgs=dict( type='MultiScaleDeformableAttention', embed_dims=256, num_heads=8, num_levels=3, num_points=4, im2col_step=64, dropout=0.0, batch_first=False, norm_cfg=None, init_cfg=None), ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0.0, act_cfg=dict(type='ReLU', inplace=True)), operation_order=('self_attn', 'norm', 'ffn', 'norm')), init_cfg=None), positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), init_cfg=None), enforce_decoder_input_project=False, positional_encoding=dict( type='SinePositionalEncoding', num_feats=128, normalize=True), transformer_decoder=dict( type='DetrTransformerDecoder', return_intermediate=True, num_layers=9, transformerlayers=dict( type='DetrTransformerDecoderLayer', attn_cfgs=dict( type='MultiheadAttention', embed_dims=256, num_heads=8, attn_drop=0.0, proj_drop=0.0, dropout_layer=None, batch_first=False), ffn_cfgs=dict( embed_dims=256, feedforward_channels=2048, num_fcs=2, act_cfg=dict(type='ReLU', inplace=True), ffn_drop=0.0, dropout_layer=None, add_identity=True), feedforward_channels=2048, operation_order=('cross_attn', 'norm', 'self_attn', 'norm', 'ffn', 'norm')), init_cfg=None), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0, reduction='mean', class_weight=[1.0] * num_classes + [0.1]), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=5.0), loss_dice=dict( type='DiceLoss', use_sigmoid=True, activate=True, reduction='mean', naive_dice=True, eps=1.0, loss_weight=5.0)), train_cfg=dict( num_points=12544, oversample_ratio=3.0, importance_sample_ratio=0.75, assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='ClassificationCost', weight=2.0), mask_cost=dict( type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), dice_cost=dict( type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), sampler=dict(type='MaskPseudoSampler')), test_cfg=dict( panoptic_on=True, # For now, the dataset does not support # evaluating semantic segmentation metric. semantic_on=False, instance_on=True, # max_per_image is for instance segmentation. max_per_image=100, iou_thr=0.8, # In Mask2Former's panoptic postprocessing, # it will filter mask area where score is less than 0.5 . filter_low_score=True), init_cfg=None) # find_unused_parameters = True