File size: 4,098 Bytes

b263aee
 
 
 
 
bf40aa3
b263aee
 
 
eb3b2c9
b263aee
 
 
 
 
 
 
003ab7d
b263aee
 
 
 
 
 
 
 
 
eb3b2c9
b263aee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb3b2c9
b263aee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf40aa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b263aee
bf40aa3
 
b263aee
 
 
 
 
47dc41d
b263aee

from typing import Dict, List, Literal, Optional, OrderedDict

import torch
from optimum.exporters.onnx.model_configs import ViTOnnxConfig
from optimum.utils import DummyVisionInputGenerator
from transformers.configuration_utils import PretrainedConfig

### modified from https://github.com/roboflow/rf-detr/blob/main/rfdetr/config.py

# Default torch device name: CUDA is checked first, then Apple MPS, and CPU
# is the fallback when neither backend reports itself as available.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

class RFDetrConfig(PretrainedConfig):
    """Configuration object for RF-DETR models.

    ``model_name`` selects one of two presets. The preset fixes the derived
    attributes ``encoder``, ``hidden_dim``, ``sa_nheads``, ``ca_nheads``,
    ``dec_n_points``, ``projector_scale`` and ``pretrain_weights``. All other
    arguments are stored on the instance under the same name without
    modification; their meaning is defined by the RF-DETR model code this
    config was adapted from.

    Args:
        model_name: Preset to use, ``'RFDETRBase'`` or ``'RFDETRLarge'``.
        pretrained: When falsy, ``pretrain_weights`` is set to ``""`` instead
            of the preset's checkpoint file name.
        out_feature_indexes: Stored as-is; ``None`` (the default) selects a
            fresh ``[2, 5, 8, 11]`` list for each instance.
        dec_layers, two_stage, bbox_reparam, lite_refpoint_refine, layer_norm,
        amp, num_classes, num_queries, device, resolution, group_detr,
        gradient_checkpointing: Stored as instance attributes.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported presets.
    """

    model_type = 'rf-detr'

    def __init__(
        self,
        model_name: Literal['RFDETRBase', 'RFDETRLarge'] = 'RFDETRBase',
        pretrained: bool = False,
        out_feature_indexes: Optional[List[int]] = None,
        dec_layers: int = 3,
        two_stage: bool = True,
        bbox_reparam: bool = True,
        lite_refpoint_refine: bool = True,
        layer_norm: bool = True,
        amp: bool = True,
        num_classes: int = 90,
        num_queries: int = 300,
        device: Literal["cpu", "cuda", "mps"] = DEVICE,
        resolution: int = 560,
        group_detr: int = 13,
        gradient_checkpointing: bool = False,
        **kwargs
    ):
        self.model_name = model_name
        self.pretrained = pretrained
        # Build the default list per instance so that mutating one config's
        # indexes can never leak into configs created afterwards.
        self.out_feature_indexes = (
            [2, 5, 8, 11] if out_feature_indexes is None else out_feature_indexes
        )
        self.dec_layers = dec_layers
        self.two_stage = two_stage
        self.bbox_reparam = bbox_reparam
        self.lite_refpoint_refine = lite_refpoint_refine
        self.layer_norm = layer_norm
        self.amp = amp
        self.num_classes = num_classes
        self.device = device
        self.resolution = resolution
        self.group_detr = group_detr
        self.gradient_checkpointing = gradient_checkpointing
        self.num_queries = num_queries

        if self.model_name == 'RFDETRBase':
            self.encoder = "dinov2_windowed_small"
            self.hidden_dim = 256
            self.sa_nheads = 8
            self.ca_nheads = 16
            self.dec_n_points = 2
            self.projector_scale = ["P4"]
            preset_weights = "rf-detr-base.pth"
        elif self.model_name == 'RFDETRLarge':
            self.encoder = "dinov2_windowed_base"
            self.hidden_dim = 384
            self.sa_nheads = 12
            self.ca_nheads = 24
            self.dec_n_points = 4
            self.projector_scale = ["P3", "P5"]
            preset_weights = "rf-detr-large.pth"
        else:
            # Fail here rather than hand back a config that is missing the
            # preset-derived attributes and breaks later with AttributeError.
            raise ValueError(
                f"Unknown model_name {self.model_name!r}; "
                "expected 'RFDETRBase' or 'RFDETRLarge'."
            )

        # No checkpoint file is referenced unless pretrained weights were requested.
        self.pretrain_weights = preset_weights if self.pretrained else ""
        super().__init__(**kwargs)


class RFDetrDummyInputGenerator(DummyVisionInputGenerator):
    """Dummy-input generator that also knows how to produce a ``pixel_mask``."""

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """Return a random dummy tensor for ``input_name``.

        ``"pixel_mask"`` yields a mask of shape ``[batch_size, height, width]``
        requested with dtype ``"bool"``. Any other input name yields a float
        tensor of shape ``[batch_size, num_channels, height, width]`` in
        ``float_dtype``. ``int_dtype`` is accepted but not used here.
        """
        if input_name != "pixel_mask":
            image_shape = [self.batch_size, self.num_channels, self.height, self.width]
            return self.random_float_tensor(
                shape=image_shape,
                framework=framework,
                dtype=float_dtype,
            )

        mask_shape = [self.batch_size, self.height, self.width]
        return self.random_mask_tensor(
            shape=mask_shape,
            framework=framework,
            dtype="bool",
        )


class RFDetrOnnxConfig(ViTOnnxConfig):
    """ONNX export configuration for RF-DETR, built on top of ``ViTOnnxConfig``."""

    DUMMY_INPUT_GENERATOR_CLASSES = (RFDetrDummyInputGenerator,)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Map each model input name to its ``{axis index: axis name}`` table."""
        axes_by_input = OrderedDict()
        axes_by_input["pixel_values"] = {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}
        axes_by_input["pixel_mask"] = {0: "batch_size", 1: "height", 2: "width"}
        return axes_by_input

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Return the parent's outputs, extended for the object-detection task.

        For ``task == "object-detection"`` the ``logits`` and ``pred_boxes``
        entries are added (or overwritten); for any other task the parent's
        mapping is returned untouched.
        """
        axes_by_output = super().outputs
        if self.task != "object-detection":
            return axes_by_output

        axes_by_output["logits"] = {0: "batch_size", 1: "num_queries", 2: "num_classes"}
        axes_by_output["pred_boxes"] = {0: "batch_size", 1: "num_queries", 2: "4"}
        return axes_by_output
    

# Names exported by ``from <module> import *``.
__all__ = ['RFDetrConfig', 'RFDetrOnnxConfig']