File size: 4,098 Bytes
b263aee bf40aa3 b263aee eb3b2c9 b263aee 003ab7d b263aee eb3b2c9 b263aee eb3b2c9 b263aee bf40aa3 b263aee bf40aa3 b263aee 47dc41d b263aee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
from typing import Dict, List, Literal, Optional, OrderedDict

import torch
from optimum.exporters.onnx.model_configs import ViTOnnxConfig
from optimum.utils import DummyVisionInputGenerator
from transformers.configuration_utils import PretrainedConfig
# Modified from https://github.com/roboflow/rf-detr/blob/main/rfdetr/config.py

# Default device string, used as the default for ``RFDetrConfig(device=...)``:
# "cuda" when torch reports CUDA as available, otherwise "mps" when the MPS
# backend is available, otherwise "cpu".
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
class RFDetrConfig(PretrainedConfig):
    """Configuration holder for RF-DETR models.

    Every constructor argument is stored as an attribute of the same name.
    On top of that, ``model_name`` selects a preset of architecture
    attributes:

    * ``'RFDETRBase'``: ``encoder="dinov2_windowed_small"``,
      ``hidden_dim=256``, ``sa_nheads=8``, ``ca_nheads=16``,
      ``dec_n_points=2``, ``projector_scale=["P4"]``,
      ``pretrain_weights="rf-detr-base.pth"``.
    * ``'RFDETRLarge'``: ``encoder="dinov2_windowed_base"``,
      ``hidden_dim=384``, ``sa_nheads=12``, ``ca_nheads=24``,
      ``dec_n_points=4``, ``projector_scale=["P3", "P5"]``,
      ``pretrain_weights="rf-detr-large.pth"``.

    Any other ``model_name`` sets none of the preset attributes; in that case
    they are only present if supplied through ``**kwargs``.

    Args:
        model_name: Name of the preset to apply (see above).
        pretrained: When false, ``pretrain_weights`` is reset to ``""``
            regardless of the preset.
        out_feature_indexes: Stored as-is; ``None`` (the default) yields a
            fresh ``[2, 5, 8, 11]`` list for each instance.
        dec_layers: Stored as-is.
        two_stage: Stored as-is.
        bbox_reparam: Stored as-is.
        lite_refpoint_refine: Stored as-is.
        layer_norm: Stored as-is.
        amp: Stored as-is.
        num_classes: Stored as-is.
        num_queries: Stored as-is.
        device: Device string; defaults to the module-level ``DEVICE``.
        resolution: Stored as-is.
        group_detr: Stored as-is.
        gradient_checkpointing: Stored as-is.
        **kwargs: Forwarded to ``PretrainedConfig.__init__`` after all of the
            attributes above have been assigned.
    """

    model_type = 'rf-detr'

    def __init__(
        self,
        # Two separate literals; the original single string
        # 'RFDETRBase, RFDETRLarge' matched neither valid value.
        model_name: Literal['RFDETRBase', 'RFDETRLarge'] = 'RFDETRBase',
        pretrained: bool = False,
        # ``None`` sentinel instead of a list literal so that instances never
        # share (and accidentally mutate) one default list object.
        out_feature_indexes: Optional[List[int]] = None,
        dec_layers: int = 3,
        two_stage: bool = True,
        bbox_reparam: bool = True,
        lite_refpoint_refine: bool = True,
        layer_norm: bool = True,
        amp: bool = True,
        num_classes: int = 90,
        num_queries: int = 300,
        device: Literal["cpu", "cuda", "mps"] = DEVICE,
        resolution: int = 560,
        group_detr: int = 13,
        gradient_checkpointing: bool = False,
        **kwargs
    ):
        if out_feature_indexes is None:
            out_feature_indexes = [2, 5, 8, 11]

        self.model_name = model_name
        self.pretrained = pretrained
        self.out_feature_indexes = out_feature_indexes
        self.dec_layers = dec_layers
        self.two_stage = two_stage
        self.bbox_reparam = bbox_reparam
        self.lite_refpoint_refine = lite_refpoint_refine
        self.layer_norm = layer_norm
        self.amp = amp
        self.num_classes = num_classes
        self.device = device
        self.resolution = resolution
        self.group_detr = group_detr
        self.gradient_checkpointing = gradient_checkpointing
        self.num_queries = num_queries

        # Preset architecture attributes selected by the model name.
        if self.model_name == 'RFDETRBase':
            self.encoder = "dinov2_windowed_small"
            self.hidden_dim = 256
            self.sa_nheads = 8
            self.ca_nheads = 16
            self.dec_n_points = 2
            self.projector_scale = ["P4"]
            self.pretrain_weights = "rf-detr-base.pth"
        elif self.model_name == 'RFDETRLarge':
            self.encoder = "dinov2_windowed_base"
            self.hidden_dim = 384
            self.sa_nheads = 12
            self.ca_nheads = 24
            self.dec_n_points = 4
            self.projector_scale = ["P3", "P5"]
            self.pretrain_weights = "rf-detr-large.pth"

        # Without pretraining there is no checkpoint file to point at.
        if not self.pretrained:
            self.pretrain_weights = ""

        # Called last, so the base class receives the remaining kwargs only
        # after every attribute above is already assigned.
        super().__init__(**kwargs)
class RFDetrDummyInputGenerator(DummyVisionInputGenerator):
    """Dummy-input generator that also knows how to build a ``pixel_mask``."""

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        """Return a random dummy tensor for ``input_name``.

        Args:
            input_name: ``"pixel_mask"`` produces a mask tensor; every other
                name produces a float image tensor.
            framework: Forwarded to the tensor helper.
            int_dtype: Accepted for signature compatibility; not used here.
            float_dtype: dtype name used for the float image tensor.

        Returns:
            For ``"pixel_mask"``: the result of ``random_mask_tensor`` with
            shape ``[batch_size, height, width]`` and dtype ``"bool"``.
            Otherwise: the result of ``random_float_tensor`` with shape
            ``[batch_size, num_channels, height, width]``.
        """
        if input_name != "pixel_mask":
            image_shape = [self.batch_size, self.num_channels, self.height, self.width]
            return self.random_float_tensor(
                shape=image_shape,
                framework=framework,
                dtype=float_dtype,
            )

        mask_shape = [self.batch_size, self.height, self.width]
        return self.random_mask_tensor(
            shape=mask_shape,
            framework=framework,
            dtype="bool",
        )
class RFDetrOnnxConfig(ViTOnnxConfig):
    """ONNX export configuration for RF-DETR, built on ``ViTOnnxConfig``."""

    # Use the generator that can also produce a ``pixel_mask`` input.
    DUMMY_INPUT_GENERATOR_CLASSES = (RFDetrDummyInputGenerator,)

    @property
    def inputs(self) -> Dict[str, Dict[int, str]]:
        """Map each input name to its ``{axis index: axis name}`` table."""
        image_axes = {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}
        mask_axes = {0: "batch_size", 1: "height", 2: "width"}
        return OrderedDict(
            [
                ("pixel_values", image_axes),
                ("pixel_mask", mask_axes),
            ]
        )

    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        """Return the parent's output axes, extended for object detection.

        For the ``"object-detection"`` task the ``logits`` and ``pred_boxes``
        entries are set (overwriting any the parent provided); for every
        other task the parent's mapping is returned unchanged.
        """
        axes_by_output = super().outputs
        if self.task != "object-detection":
            return axes_by_output

        axes_by_output["logits"] = {0: "batch_size", 1: "num_queries", 2: "num_classes"}
        axes_by_output["pred_boxes"] = {0: "batch_size", 1: "num_queries", 2: "4"}
        return axes_by_output
# Public API of this module. The stray trailing "|" that followed the closing
# bracket was not valid Python and has been removed.
__all__ = [
    'RFDetrConfig',
    'RFDetrOnnxConfig',
]