|
|
|
# Dataset class split: "things" (countable instances) vs "stuff" (amorphous
# regions). The panoptic head predicts over their union, plus one extra
# "no object" slot handled via class_weight below.
num_things_classes = 100
num_stuff_classes = 50
num_classes = num_things_classes + num_stuff_classes

# SyncBN keeps batch statistics consistent across GPUs in distributed training.
norm_cfg = {'type': 'SyncBN', 'requires_grad': True}

# Mask2Former panoptic segmentor on an XCiT backbone.
# NOTE(review): decode_head.in_channels is [256, 512, 1024, 2048] while the
# backbone is configured with embed_dim=384 — presumably a derived config
# overrides in_channels (or an adapter/neck reshapes the features); confirm
# before using this file stand-alone.
model = {
    'type': 'EncoderDecoderMask2Former',
    'pretrained': None,
    'backbone': {
        'type': 'XCiT',
        'patch_size': 16,
        'embed_dim': 384,
        'depth': 12,
        'num_heads': 8,
        'mlp_ratio': 4,
        'qkv_bias': True,
        'use_abs_pos_emb': True,
        'use_rel_pos_bias': False,
    },
    'decode_head': {
        'type': 'Mask2FormerHead',
        'in_channels': [256, 512, 1024, 2048],
        'feat_channels': 256,
        'out_channels': 256,
        'in_index': [0, 1, 2, 3],
        'num_things_classes': num_things_classes,
        'num_stuff_classes': num_stuff_classes,
        'num_queries': 100,
        'num_transformer_feat_level': 3,
        # Pixel decoder: multi-scale deformable-attention encoder over the
        # three highest-resolution feature levels.
        'pixel_decoder': {
            'type': 'MSDeformAttnPixelDecoder',
            'num_outs': 3,
            'norm_cfg': {'type': 'GN', 'num_groups': 32},
            'act_cfg': {'type': 'ReLU'},
            'encoder': {
                'type': 'DetrTransformerEncoder',
                'num_layers': 6,
                'transformerlayers': {
                    'type': 'BaseTransformerLayer',
                    'attn_cfgs': {
                        'type': 'MultiScaleDeformableAttention',
                        'embed_dims': 256,
                        'num_heads': 8,
                        'num_levels': 3,
                        'num_points': 4,
                        'im2col_step': 64,
                        'dropout': 0.0,
                        'batch_first': False,
                        'norm_cfg': None,
                        'init_cfg': None,
                    },
                    'ffn_cfgs': {
                        'type': 'FFN',
                        'embed_dims': 256,
                        'feedforward_channels': 1024,
                        'num_fcs': 2,
                        'ffn_drop': 0.0,
                        'act_cfg': {'type': 'ReLU', 'inplace': True},
                    },
                    'operation_order': ('self_attn', 'norm', 'ffn', 'norm'),
                },
                'init_cfg': None,
            },
            'positional_encoding': {
                'type': 'SinePositionalEncoding',
                'num_feats': 128,
                'normalize': True,
            },
            'init_cfg': None,
        },
        'enforce_decoder_input_project': False,
        'positional_encoding': {
            'type': 'SinePositionalEncoding',
            'num_feats': 128,
            'normalize': True,
        },
        # Query decoder: 9 layers, cross-attention first (Mask2Former order),
        # intermediate layer outputs kept for auxiliary losses.
        'transformer_decoder': {
            'type': 'DetrTransformerDecoder',
            'return_intermediate': True,
            'num_layers': 9,
            'transformerlayers': {
                'type': 'DetrTransformerDecoderLayer',
                'attn_cfgs': {
                    'type': 'MultiheadAttention',
                    'embed_dims': 256,
                    'num_heads': 8,
                    'attn_drop': 0.0,
                    'proj_drop': 0.0,
                    'dropout_layer': None,
                    'batch_first': False,
                },
                'ffn_cfgs': {
                    'embed_dims': 256,
                    'feedforward_channels': 2048,
                    'num_fcs': 2,
                    'act_cfg': {'type': 'ReLU', 'inplace': True},
                    'ffn_drop': 0.0,
                    'dropout_layer': None,
                    'add_identity': True,
                },
                'feedforward_channels': 2048,
                'operation_order': ('cross_attn', 'norm', 'self_attn', 'norm',
                                    'ffn', 'norm'),
            },
            'init_cfg': None,
        },
        # Classification loss: one unit weight per real class plus a
        # down-weighted (0.1) trailing "no object" entry.
        'loss_cls': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 2.0,
            'reduction': 'mean',
            'class_weight': [1.0] * num_classes + [0.1],
        },
        'loss_mask': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': True,
            'reduction': 'mean',
            'loss_weight': 5.0,
        },
        'loss_dice': {
            'type': 'DiceLoss',
            'use_sigmoid': True,
            'activate': True,
            'reduction': 'mean',
            'naive_dice': True,
            'eps': 1.0,
            'loss_weight': 5.0,
        },
    },
    # Training: point-sampled mask supervision; Hungarian matching costs
    # mirror the loss weights above (cls 2.0, mask 5.0, dice 5.0).
    'train_cfg': {
        'num_points': 12544,
        'oversample_ratio': 3.0,
        'importance_sample_ratio': 0.75,
        'assigner': {
            'type': 'MaskHungarianAssigner',
            'cls_cost': {'type': 'ClassificationCost', 'weight': 2.0},
            'mask_cost': {
                'type': 'CrossEntropyLossCost',
                'weight': 5.0,
                'use_sigmoid': True,
            },
            'dice_cost': {
                'type': 'DiceCost',
                'weight': 5.0,
                'pred_act': True,
                'eps': 1.0,
            },
        },
        'sampler': {'type': 'MaskPseudoSampler'},
    },
    # Inference: produce panoptic and instance results; no standalone
    # semantic map.
    'test_cfg': {
        'panoptic_on': True,
        'semantic_on': False,
        'instance_on': True,
        'max_per_image': 100,
        'iou_thr': 0.8,
        'filter_low_score': True,
    },
    'init_cfg': None,
}
|
|
|
|
|
|