Spaces:

chlab
/

interactive_kinematic_planet_detector

Sleeping

App Files Files Community

jpterry committed on Jun 16, 2022

Commit

68bc350

1 Parent(s): bc0a3bf

updates

Browse files

Files changed (1) hide show

app.py +718 -715

app.py CHANGED Viewed

@@ -73,843 +73,846 @@ effnet_hparams = {61: {
 activation_indices = {'efficientnet': [0, 3]}
-########## EfficientNet ############
-@dataclass
-class _MBConvConfig:
-    expand_ratio: float
-    kernel: int
-    stride: int
-    input_channels: int
-    out_channels: int
-    num_layers: int
-    block: Callable[..., nn.Module]
-    @staticmethod
-    def adjust_channels(
-        channels: int, width_mult: float, min_value: Optional[int] = None
-    ) -> int:
-        return _make_divisible(channels * width_mult, 8, min_value)
-class MBConvConfig(_MBConvConfig):
-    # Stores information listed at Table 1 of the EfficientNet paper & Table 4 of the EfficientNetV2 paper
-    def __init__(
-        self,
-        expand_ratio: float,
-        kernel: int,
-        stride: int,
-        input_channels: int,
-        out_channels: int,
-        num_layers: int,
-        width_mult: float = 1.0,
-        depth_mult: float = 1.0,
-        block: Optional[Callable[..., nn.Module]] = None,
-    ) -> None:
-        input_channels = self.adjust_channels(input_channels, width_mult)
-        out_channels = self.adjust_channels(out_channels, width_mult)
-        num_layers = self.adjust_depth(num_layers, depth_mult)
-        if block is None:
-            block = MBConv
-        super().__init__(
-            expand_ratio,
-            kernel,
-            stride,
-            input_channels,
-            out_channels,
-            num_layers,
-            block,
-        )
-    @staticmethod
-    def adjust_depth(num_layers: int, depth_mult: float):
-        return int(math.ceil(num_layers * depth_mult))
-class FusedMBConvConfig(_MBConvConfig):
-    # Stores information listed at Table 4 of the EfficientNetV2 paper
     def __init__(
         self,
-        expand_ratio: float,
-        kernel: int,
-        stride: int,
-        input_channels: int,
-        out_channels: int,
-        num_layers: int,
-        block: Optional[Callable[..., nn.Module]] = None,
-    ) -> None:
-        if block is None:
-            block = FusedMBConv
-        super().__init__(
-            expand_ratio,
-            kernel,
-            stride,
-            input_channels,
-            out_channels,
-            num_layers,
-            block,
         )
-class MBConv(nn.Module):
     def __init__(
         self,
-        cnf: MBConvConfig,
-        stochastic_depth_prob: float,
-        norm_layer: Callable[..., nn.Module],
-        se_layer: Callable[..., nn.Module] = SqueezeExcitation,
     ) -> None:
-        super().__init__()
-        if not (1 <= cnf.stride <= 2):
-            raise ValueError("illegal stride value")
-        self.use_res_connect = (
-            cnf.stride == 1 and cnf.input_channels == cnf.out_channels
-        )
-        layers: List[nn.Module] = []
-        activation_layer = nn.SiLU
-        # expand
-        expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
-        if expanded_channels != cnf.input_channels:
-            layers.append(
-                Conv2dNormActivation(
-                    cnf.input_channels,
-                    expanded_channels,
-                    kernel_size=1,
-                    norm_layer=norm_layer,
-                    activation_layer=activation_layer,
-                )
             )
-        # depthwise
-        layers.append(
-            Conv2dNormActivation(
-                expanded_channels,
-                expanded_channels,
-                kernel_size=cnf.kernel,
-                stride=cnf.stride,
-                groups=expanded_channels,
-                norm_layer=norm_layer,
-                activation_layer=activation_layer,
-            )
-        )
-        # squeeze and excitation
-        squeeze_channels = max(1, cnf.input_channels // 4)
-        layers.append(
-            se_layer(
-                expanded_channels,
-                squeeze_channels,
-                activation=partial(nn.SiLU, inplace=True),
-            )
-        )
-        # project
-        layers.append(
-            Conv2dNormActivation(
-                expanded_channels,
-                cnf.out_channels,
-                kernel_size=1,
-                norm_layer=norm_layer,
-                activation_layer=None,
             )
-        )
-        self.block = nn.Sequential(*layers)
-        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
-        self.out_channels = cnf.out_channels
-    def forward(self, input: Tensor) -> Tensor:
-        result = self.block(input)
-        if self.use_res_connect:
-            result = self.stochastic_depth(result)
-            result += input
-        return result
-class FusedMBConv(nn.Module):
     def __init__(
         self,
-        cnf: FusedMBConvConfig,
-        stochastic_depth_prob: float,
-        norm_layer: Callable[..., nn.Module],
     ) -> None:
-        super().__init__()
-        if not (1 <= cnf.stride <= 2):
-            raise ValueError("illegal stride value")
-        self.use_res_connect = (
-            cnf.stride == 1 and cnf.input_channels == cnf.out_channels
         )
-        layers: List[nn.Module] = []
-        activation_layer = nn.SiLU
-        expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
-        if expanded_channels != cnf.input_channels:
-            # fused expand
-            layers.append(
-                Conv2dNormActivation(
-                    cnf.input_channels,
-                    expanded_channels,
-                    kernel_size=cnf.kernel,
-                    stride=cnf.stride,
-                    norm_layer=norm_layer,
-                    activation_layer=activation_layer,
-                )
-            )
-            # project
-            layers.append(
-                Conv2dNormActivation(
-                    expanded_channels,
-                    cnf.out_channels,
-                    kernel_size=1,
-                    norm_layer=norm_layer,
-                    activation_layer=None,
-                )
-            )
-        else:
-            layers.append(
-                Conv2dNormActivation(
-                    cnf.input_channels,
-                    cnf.out_channels,
-                    kernel_size=cnf.kernel,
-                    stride=cnf.stride,
-                    norm_layer=norm_layer,
-                    activation_layer=activation_layer,
-                )
-            )
-        self.block = nn.Sequential(*layers)
-        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
-        self.out_channels = cnf.out_channels
-    def forward(self, input: Tensor) -> Tensor:
-        result = self.block(input)
-        if self.use_res_connect:
-            result = self.stochastic_depth(result)
-            result += input
-        return result
-class EfficientNetConfig(PretrainedConfig):
-    model_type = "efficientnet"
     def __init__(
         self,
-        # inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
-        dropout: float=0.25,
-        num_channels: int = 61,
-        stochastic_depth_prob: float = 0.2,
-        num_classes: int = 2,
-        norm_layer: Optional[Callable[..., nn.Module]] = None,
-        # last_channel: Optional[int] = None,
-        size: str='v2_s',
-        width_mult: float = 1.0,
-        depth_mult: float = 1.0,
-        **kwargs: Any,
     ) -> None:
-        """
-        EfficientNet V1 and V2 main class
-        Args:
-            inverted_residual_setting (Sequence[Union[MBConvConfig, FusedMBConvConfig]]): Network structure
-            dropout (float): The dropout probability
-            stochastic_depth_prob (float): The stochastic depth probability
-            num_classes (int): Number of classes
-            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
-            last_channel (int): The number of channels on the penultimate layer
-        """
-        # self.model = EfficientNet(
-        #                             dropout=dropout,
-        #                             num_channels=num_channels,
-        #                             num_classes=num_classes,
-        #                             size=size,
-        #                             stochastic_depth_prob=stochastic_depth_prob,
-        #                             width_mult=width_mult,
-        #                             depth_mult=depth_mult,
-        # )
-        #
-        self.dropout=dropout
-        self.num_channels=num_channels
-        self.num_classes=num_classes
-        self.size=size
-        self.stochastic_depth_prob=stochastic_depth_prob
-        self.width_mult=width_mult
-        self.depth_mult=depth_mult
-        super().__init__(**kwargs)
-class EfficientNetPreTrained(PreTrainedModel):
-    config_class = EfficientNetConfig
-    def __init__(
-        self,
-        config
-    ):
-        super().__init__(config)
-        self.model = EfficientNet(  dropout=config.dropout,
-                                    num_channels=config.num_channels,
-                                    num_classes=config.num_classes,
-                                    size=config.size,
-                                    stochastic_depth_prob=config.stochastic_depth_prob,
-                                    width_mult=config.width_mult,
-                                    depth_mult=config.depth_mult,)
-    def forward(self, tensor):
-        return self.model.forward(tensor)
-class EfficientNet(nn.Module):
     def __init__(
         self,
-        # inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
-        dropout: float=0.25,
-        num_channels: int = 61,
-        stochastic_depth_prob: float = 0.2,
-        num_classes: int = 2,
-        norm_layer: Optional[Callable[..., nn.Module]] = None,
-        # last_channel: Optional[int] = None,
-        size: str='v2_s',
-        width_mult: float = 1.0,
-        depth_mult: float = 1.0,
-        **kwargs: Any,
     ) -> None:
-        """
-        EfficientNet V1 and V2 main class
-        Args:
-            inverted_residual_setting (Sequence[Union[MBConvConfig, FusedMBConvConfig]]): Network structure
-            dropout (float): The dropout probability
-            stochastic_depth_prob (float): The stochastic depth probability
-            num_classes (int): Number of classes
-            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use
-            last_channel (int): The number of channels on the penultimate layer
-        """
         super().__init__()
         # _log_api_usage_once(self)
-        inverted_residual_setting, last_channel = _efficientnet_conf(
-                     "efficientnet_%s" % (size), width_mult=width_mult, depth_mult=depth_mult
-                    )
-        if not inverted_residual_setting:
-            raise ValueError("The inverted_residual_setting should not be empty")
-        elif not (
-            isinstance(inverted_residual_setting, Sequence)
-            and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting])
-        ):
-            raise TypeError(
-                "The inverted_residual_setting should be List[MBConvConfig]"
-            )
-        if "block" in kwargs:
-            warnings.warn(
-                "The parameter 'block' is deprecated since 0.13 and will be removed 0.15. "
-                "Please pass this information on 'MBConvConfig.block' instead."
-            )
-            if kwargs["block"] is not None:
-                for s in inverted_residual_setting:
-                    if isinstance(s, MBConvConfig):
-                        s.block = kwargs["block"]
-        if norm_layer is None:
-            norm_layer = nn.BatchNorm2d
-        layers: List[nn.Module] = []
-        # building first layer
-        firstconv_output_channels = inverted_residual_setting[0].input_channels
-        layers.append(
-            Conv2dNormActivation(
-                num_channels,
-                firstconv_output_channels,
-                kernel_size=3,
-                stride=2,
-                norm_layer=norm_layer,
-                activation_layer=nn.SiLU,
-            )
-        )
-        # building inverted residual blocks
-        total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
-        stage_block_id = 0
-        for cnf in inverted_residual_setting:
-            stage: List[nn.Module] = []
-            for _ in range(cnf.num_layers):
-                # copy to avoid modifications. shallow copy is enough
-                block_cnf = copy.copy(cnf)
-                # overwrite info if not the first conv in the stage
-                if stage:
-                    block_cnf.input_channels = block_cnf.out_channels
-                    block_cnf.stride = 1
-                # adjust stochastic depth probability based on the depth of the stage block
-                sd_prob = (
-                    stochastic_depth_prob * float(stage_block_id) / total_stage_blocks
-                )
-                stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer))
-                stage_block_id += 1
-            layers.append(nn.Sequential(*stage))
-        # building last several layers
-        lastconv_input_channels = inverted_residual_setting[-1].out_channels
-        lastconv_output_channels = (
-            last_channel if last_channel is not None else 4 * lastconv_input_channels
-        )
-        layers.append(
-            Conv2dNormActivation(
-                lastconv_input_channels,
-                lastconv_output_channels,
-                kernel_size=1,
-                norm_layer=norm_layer,
-                activation_layer=nn.SiLU,
-            )
-        )
-        self.features = nn.Sequential(*layers)
-        self.avgpool = nn.AdaptiveAvgPool2d(1)
-        self.classifier = nn.Sequential(
-            nn.Dropout(p=dropout, inplace=True),
-            nn.Linear(lastconv_output_channels, num_classes),
-        )
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight, mode="fan_out")
-                if m.bias is not None:
-                    nn.init.zeros_(m.bias)
-            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
-                nn.init.ones_(m.weight)
-                nn.init.zeros_(m.bias)
-            elif isinstance(m, nn.Linear):
-                init_range = 1.0 / math.sqrt(m.out_features)
-                nn.init.uniform_(m.weight, -init_range, init_range)
-                nn.init.zeros_(m.bias)
-        # super().__init__(**kwargs)
-    def _forward_impl(self, x: Tensor) -> Tensor:
-        x = self.features(x)
-        x = self.avgpool(x)
-        x = torch.flatten(x, 1)
-        x = self.classifier(x)
-        return x
     def forward(self, x: Tensor) -> Tensor:
-        return self._forward_impl(x)
-# def _efficientnet(
-#     inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
-#     dropout: float,
-#     last_channel: Optional[int],
-#     weights=None,
-#     num_channels: int = 61,
-#     stochastic_depth_prob: float = 0.2,
-#     progress: bool = True,
-#     num_classes: int = 2,
-#     **kwargs: Any,
-# ) -> EfficientNetCongig:
-#     model = EfficientNetCongif(
-#         inverted_residual_setting,
-#         dropout,
-#         num_classes=num_classes,
-#         num_channels=num_channels,
-#         stochastic_depth_prob=stochastic_depth_prob,
-#         last_channel=last_channel,
-#         **kwargs,
-#     )
-#     return model
-def _efficientnet_conf(
-    arch: str,
-    **kwargs: Any,
-) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]:
-    inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]]
-    if arch.startswith("efficientnet_b"):
-        bneck_conf = partial(
-            MBConvConfig,
-            width_mult=kwargs.pop("width_mult"),
-            depth_mult=kwargs.pop("depth_mult"),
-        )
-        inverted_residual_setting = [
-            bneck_conf(1, 3, 1, 32, 16, 1),
-            bneck_conf(6, 3, 2, 16, 24, 2),
-            bneck_conf(6, 5, 2, 24, 40, 2),
-            bneck_conf(6, 3, 2, 40, 80, 3),
-            bneck_conf(6, 5, 1, 80, 112, 3),
-            bneck_conf(6, 5, 2, 112, 192, 4),
-            bneck_conf(6, 3, 1, 192, 320, 1),
-        ]
-        last_channel = None
-    elif arch.startswith("efficientnet_v2_s"):
-        inverted_residual_setting = [
-            FusedMBConvConfig(1, 3, 1, 24, 24, 2),
-            FusedMBConvConfig(4, 3, 2, 24, 48, 4),
-            FusedMBConvConfig(4, 3, 2, 48, 64, 4),
-            MBConvConfig(4, 3, 2, 64, 128, 6),
-            MBConvConfig(6, 3, 1, 128, 160, 9),
-            MBConvConfig(6, 3, 2, 160, 256, 15),
-        ]
-        last_channel = 1280
-    elif arch.startswith("efficientnet_v2_m"):
-        inverted_residual_setting = [
-            FusedMBConvConfig(1, 3, 1, 24, 24, 3),
-            FusedMBConvConfig(4, 3, 2, 24, 48, 5),
-            FusedMBConvConfig(4, 3, 2, 48, 80, 5),
-            MBConvConfig(4, 3, 2, 80, 160, 7),
-            MBConvConfig(6, 3, 1, 160, 176, 14),
-            MBConvConfig(6, 3, 2, 176, 304, 18),
-            MBConvConfig(6, 3, 1, 304, 512, 5),
-        ]
-        last_channel = 1280
-    elif arch.startswith("efficientnet_v2_l"):
-        inverted_residual_setting = [
-            FusedMBConvConfig(1, 3, 1, 32, 32, 4),
-            FusedMBConvConfig(4, 3, 2, 32, 64, 7),
-            FusedMBConvConfig(4, 3, 2, 64, 96, 7),
-            MBConvConfig(4, 3, 2, 96, 192, 10),
-            MBConvConfig(6, 3, 1, 192, 224, 19),
-            MBConvConfig(6, 3, 2, 224, 384, 25),
-            MBConvConfig(6, 3, 1, 384, 640, 7),
-        ]
-        last_channel = 1280
-    else:
-        raise ValueError(f"Unsupported model type {arch}")
-    return inverted_residual_setting, last_channel
-#### extra torchvision stuff ####
-class FrozenBatchNorm2d(torch.nn.Module):
-    """
-    BatchNorm2d where the batch statistics and the affine parameters are fixed
-    Args:
-        num_features (int): Number of features ``C`` from an expected input of size ``(N, C, H, W)``
-        eps (float): a value added to the denominator for numerical stability. Default: 1e-5
-    """
     def __init__(
         self,
-        num_features: int,
-        eps: float = 1e-5,
-    ):
-        super().__init__()
-        # _log_api_usage_once(self)
-        self.eps = eps
-        self.register_buffer("weight", torch.ones(num_features))
-        self.register_buffer("bias", torch.zeros(num_features))
-        self.register_buffer("running_mean", torch.zeros(num_features))
-        self.register_buffer("running_var", torch.ones(num_features))
-    def _load_from_state_dict(
-        self,
-        state_dict: dict,
-        prefix: str,
-        local_metadata: dict,
-        strict: bool,
-        missing_keys: List[str],
-        unexpected_keys: List[str],
-        error_msgs: List[str],
-    ):
-        num_batches_tracked_key = prefix + "num_batches_tracked"
-        if num_batches_tracked_key in state_dict:
-            del state_dict[num_batches_tracked_key]
-        super()._load_from_state_dict(
-            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
         )
-    def forward(self, x: Tensor) -> Tensor:
-        # move reshapes to the beginning
-        # to make it fuser-friendly
-        w = self.weight.reshape(1, -1, 1, 1)
-        b = self.bias.reshape(1, -1, 1, 1)
-        rv = self.running_var.reshape(1, -1, 1, 1)
-        rm = self.running_mean.reshape(1, -1, 1, 1)
-        scale = w * (rv + self.eps).rsqrt()
-        bias = b - rm * scale
-        return x * scale + bias
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})"
-class ConvNormActivation(torch.nn.Sequential):
     def __init__(
         self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int = 3,
-        stride: int = 1,
-        padding: Optional[int] = None,
-        groups: int = 1,
-        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
-        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
-        dilation: int = 1,
-        inplace: Optional[bool] = True,
-        bias: Optional[bool] = None,
-        conv_layer: Callable[..., torch.nn.Module] = torch.nn.Conv2d,
     ) -> None:
-        if padding is None:
-            padding = (kernel_size - 1) // 2 * dilation
-        if bias is None:
-            bias = norm_layer is None
-        layers = [
-            conv_layer(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding,
-                dilation=dilation,
-                groups=groups,
-                bias=bias,
-            )
-        ]
-        if norm_layer is not None:
-            layers.append(norm_layer(out_channels))
-        if activation_layer is not None:
-            params = {} if inplace is None else {"inplace": inplace}
-            layers.append(activation_layer(**params))
-        super().__init__(*layers)
-        # _log_api_usage_once(self)
-        self.out_channels = out_channels
-        if self.__class__ == ConvNormActivation:
-            warnings.warn(
-                "Don't use ConvNormActivation directly, please use Conv2dNormActivation and Conv3dNormActivation instead."
             )
-class Conv2dNormActivation(ConvNormActivation):
-    """
-    Configurable block used for Convolution2d-Normalization-Activation blocks.
-    Args:
-        in_channels (int): Number of channels in the input image
-        out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
-        kernel_size: (int, optional): Size of the convolving kernel. Default: 3
-        stride (int, optional): Stride of the convolution. Default: 1
-        padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
-        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
-        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm2d``
-        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
-        dilation (int): Spacing between kernel elements. Default: 1
-        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
-        bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
-    """
     def __init__(
         self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int = 3,
-        stride: int = 1,
-        padding: Optional[int] = None,
-        groups: int = 1,
-        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
-        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
-        dilation: int = 1,
-        inplace: Optional[bool] = True,
-        bias: Optional[bool] = None,
     ) -> None:
-        super().__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            groups,
-            norm_layer,
-            activation_layer,
-            dilation,
-            inplace,
-            bias,
-            torch.nn.Conv2d,
         )
-class Conv3dNormActivation(ConvNormActivation):
-    """
-    Configurable block used for Convolution3d-Normalization-Activation blocks.
-    Args:
-        in_channels (int): Number of channels in the input video.
-        out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
-        kernel_size: (int, optional): Size of the convolving kernel. Default: 3
-        stride (int, optional): Stride of the convolution. Default: 1
-        padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``
-        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
-        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm3d``
-        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
-        dilation (int): Spacing between kernel elements. Default: 1
-        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
-        bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
-    """
     def __init__(
         self,
-        in_channels: int,
-        out_channels: int,
-        kernel_size: int = 3,
-        stride: int = 1,
-        padding: Optional[int] = None,
-        groups: int = 1,
-        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm3d,
-        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
-        dilation: int = 1,
-        inplace: Optional[bool] = True,
-        bias: Optional[bool] = None,
     ) -> None:
-        super().__init__(
-            in_channels,
-            out_channels,
-            kernel_size,
-            stride,
-            padding,
-            groups,
-            norm_layer,
-            activation_layer,
-            dilation,
-            inplace,
-            bias,
-            torch.nn.Conv3d,
         )
-class SqueezeExcitation(torch.nn.Module):
-    """
-    This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
-    Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
-    Args:
-        input_channels (int): Number of channels in the input image
-        squeeze_channels (int): Number of squeeze channels
-        activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU``
-        scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid``
-    """
-    def __init__(
-        self,
-        input_channels: int,
-        squeeze_channels: int,
-        activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
-        scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
-    ) -> None:
-        super().__init__()
-        # _log_api_usage_once(self)
-        self.avgpool = torch.nn.AdaptiveAvgPool2d(1)
-        self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1)
-        self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1)
-        self.activation = activation()
-        self.scale_activation = scale_activation()
-    def _scale(self, input: Tensor) -> Tensor:
-        scale = self.avgpool(input)
-        scale = self.fc1(scale)
-        scale = self.activation(scale)
-        scale = self.fc2(scale)
-        return self.scale_activation(scale)
-    def forward(self, input: Tensor) -> Tensor:
-        scale = self._scale(input)
-        return scale * input
-class MLP(torch.nn.Sequential):
-    """This block implements the multi-layer perceptron (MLP) module.
-    Args:
-        in_channels (int): Number of channels of the input
-        hidden_channels (List[int]): List of the hidden channel dimensions
-        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``None``
-        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
-        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
-        bias (bool): Whether to use bias in the linear layer. Default ``True``
-        dropout (float): The probability for the dropout layer. Default: 0.0
-    """
-    def __init__(
-        self,
-        in_channels: int,
-        hidden_channels: List[int],
-        norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
-        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
-        inplace: Optional[bool] = True,
-        bias: bool = True,
-        dropout: float = 0.0,
-    ):
-        # The addition of `norm_layer` is inspired from the implementation of TorchMultimodal:
-        # https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
-        params = {} if inplace is None else {"inplace": inplace}
-        layers = []
-        in_dim = in_channels
-        for hidden_dim in hidden_channels[:-1]:
-            layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
-            if norm_layer is not None:
-                layers.append(norm_layer(hidden_dim))
-            layers.append(activation_layer(**params))
-            layers.append(torch.nn.Dropout(dropout, **params))
-            in_dim = hidden_dim
-        layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias))
-        layers.append(torch.nn.Dropout(dropout, **params))
-        super().__init__(*layers)
-        # _log_api_usage_once(self)
-class Permute(torch.nn.Module):
-    """This module returns a view of the tensor input with its dimensions permuted.
-    Args:
-        dims (List[int]): The desired ordering of dimensions
-    """
-    def __init__(self, dims: List[int]):
-        super().__init__()
-        self.dims = dims
-    def forward(self, x: Tensor) -> Tensor:
-        return torch.permute(x, self.dims)
 def normalize_array(x: list):

 activation_indices = {'efficientnet': [0, 3]}
+#### extra torchvision stuff ####
+class FrozenBatchNorm2d(torch.nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+    ``weight``, ``bias``, ``running_mean`` and ``running_var`` are registered as buffers
+    (not parameters) and ``forward`` only reads them.
+    Args:
+        num_features (int): Number of features ``C`` from an expected input of size ``(N, C, H, W)``
+        eps (float): a value added to the denominator for numerical stability. Default: 1e-5
+    """
     def __init__(
         self,
+        num_features: int,
+        eps: float = 1e-5,
+    ):
+        super().__init__()
+        # _log_api_usage_once(self)
+        self.eps = eps
+        # Initial values make the layer an (almost) identity map: scale 1, shift 0, mean 0, variance 1.
+        self.register_buffer("weight", torch.ones(num_features))
+        self.register_buffer("bias", torch.zeros(num_features))
+        self.register_buffer("running_mean", torch.zeros(num_features))
+        self.register_buffer("running_var", torch.ones(num_features))
+    def _load_from_state_dict(
+        self,
+        state_dict: dict,
+        prefix: str,
+        local_metadata: dict,
+        strict: bool,
+        missing_keys: List[str],
+        unexpected_keys: List[str],
+        error_msgs: List[str],
+    ):
+        """Remove this module's ``num_batches_tracked`` entry (if any) and defer to the default loader."""
+        # This module registers no ``num_batches_tracked`` buffer; presumably the key is dropped so
+        # that checkpoints saved from a regular BatchNorm2d load cleanly -- confirm.
+        # Note that the entry is deleted from the caller's ``state_dict`` in place.
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
         )
+    def forward(self, x: Tensor) -> Tensor:
+        """Return ``(x - running_mean) * weight / sqrt(running_var + eps) + bias``, folded into one scale and one shift."""
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+        # Each buffer is reshaped to (1, C, 1, 1) so it broadcasts along dimension 1 of ``x``.
+        w = self.weight.reshape(1, -1, 1, 1)
+        b = self.bias.reshape(1, -1, 1, 1)
+        rv = self.running_var.reshape(1, -1, 1, 1)
+        rm = self.running_mean.reshape(1, -1, 1, 1)
+        scale = w * (rv + self.eps).rsqrt()
+        bias = b - rm * scale
+        return x * scale + bias
+    def __repr__(self) -> str:
+        """Return ``ClassName(num_features, eps=...)``, reading ``num_features`` from the weight buffer."""
+        return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})"
+class ConvNormActivation(torch.nn.Sequential):
+    """
+    Sequential block made of a convolution, an optional normalization layer and an optional
+    activation layer, appended in that order. Base class of ``Conv2dNormActivation`` and
+    ``Conv3dNormActivation``; instantiating it directly emits a warning.
+    Args:
+        in_channels (int): Passed to ``conv_layer`` as its first positional argument
+        out_channels (int): Passed to ``conv_layer`` and to ``norm_layer``; also stored as ``self.out_channels``
+        kernel_size (int): Passed to ``conv_layer``. Default: 3
+        stride (int): Passed to ``conv_layer``. Default: 1
+        padding (int, optional): Passed to ``conv_layer``. Default: None, in which case it is computed as ``(kernel_size - 1) // 2 * dilation``
+        groups (int): Passed to ``conv_layer``. Default: 1
+        norm_layer (Callable[..., torch.nn.Module], optional): Called with ``out_channels``. If ``None`` no normalization layer is added. Default: ``torch.nn.BatchNorm2d``
+        activation_layer (Callable[..., torch.nn.Module], optional): If ``None`` no activation layer is added. Default: ``torch.nn.ReLU``
+        dilation (int): Passed to ``conv_layer``. Default: 1
+        inplace (bool, optional): Forwarded to ``activation_layer`` as ``inplace=...``; if ``None`` the activation is built with no arguments. Default: ``True``
+        bias (bool, optional): Passed to ``conv_layer``. Default: None, in which case it is ``norm_layer is None``
+        conv_layer (Callable[..., torch.nn.Module]): Factory for the convolution. Default: ``torch.nn.Conv2d``
+    """
     def __init__(
         self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 1,
+        padding: Optional[int] = None,
+        groups: int = 1,
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        dilation: int = 1,
+        inplace: Optional[bool] = True,
+        bias: Optional[bool] = None,
+        conv_layer: Callable[..., torch.nn.Module] = torch.nn.Conv2d,
     ) -> None:
+        # NOTE(review): the default-padding formula does integer arithmetic on ``kernel_size``,
+        # so it assumes an int kernel size (as annotated), not a tuple.
+        if padding is None:
+            padding = (kernel_size - 1) // 2 * dilation
+        # The convolution only gets a bias term by default when no norm layer follows it.
+        if bias is None:
+            bias = norm_layer is None
+        layers = [
+            conv_layer(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride,
+                padding,
+                dilation=dilation,
+                groups=groups,
+                bias=bias,
             )
+        ]
+        if norm_layer is not None:
+            layers.append(norm_layer(out_channels))
+        if activation_layer is not None:
+            params = {} if inplace is None else {"inplace": inplace}
+            layers.append(activation_layer(**params))
+        super().__init__(*layers)
+        # _log_api_usage_once(self)
+        self.out_channels = out_channels
+        # Exact class comparison: subclasses do not trigger the warning.
+        if self.__class__ == ConvNormActivation:
+            warnings.warn(
+                "Don't use ConvNormActivation directly, please use Conv2dNormActivation and Conv3dNormActivation instead."
             )
+class Conv2dNormActivation(ConvNormActivation):
+    """
+    Configurable block used for Convolution2d-Normalization-Activation blocks.
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
+        kernel_size: (int, optional): Size of the convolving kernel. Default: 3
+        stride (int, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``. NOTE(review): the signature annotates this as ``Optional[int]`` only.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm2d``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
+        dilation (int): Spacing between kernel elements. Default: 1
+        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
+        bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
+    """
     def __init__(
         self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 1,
+        padding: Optional[int] = None,
+        groups: int = 1,
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        dilation: int = 1,
+        inplace: Optional[bool] = True,
+        bias: Optional[bool] = None,
     ) -> None:
+        # Arguments are forwarded positionally in the parent's parameter order;
+        # the final argument fixes ``conv_layer`` to ``torch.nn.Conv2d``.
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            groups,
+            norm_layer,
+            activation_layer,
+            dilation,
+            inplace,
+            bias,
+            torch.nn.Conv2d,
         )
+class Conv3dNormActivation(ConvNormActivation):
+    """
+    Configurable block used for Convolution3d-Normalization-Activation blocks.
+    Args:
+        in_channels (int): Number of channels in the input video.
+        out_channels (int): Number of channels produced by the Convolution-Normalization-Activation block
+        kernel_size: (int, optional): Size of the convolving kernel. Default: 3
+        stride (int, optional): Stride of the convolution. Default: 1
+        padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in which case it will be calculated as ``padding = (kernel_size - 1) // 2 * dilation``. NOTE(review): the signature annotates this as ``Optional[int]`` only.
+        groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1
+        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolution layer. If ``None`` this layer won't be used. Default: ``torch.nn.BatchNorm3d``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer won't be used. Default: ``torch.nn.ReLU``
+        dilation (int): Spacing between kernel elements. Default: 1
+        inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True``
+        bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``.
+    """
     def __init__(
         self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int = 3,
+        stride: int = 1,
+        padding: Optional[int] = None,
+        groups: int = 1,
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm3d,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        dilation: int = 1,
+        inplace: Optional[bool] = True,
+        bias: Optional[bool] = None,
     ) -> None:
+        # Arguments are forwarded positionally in the parent's parameter order;
+        # the final argument fixes ``conv_layer`` to ``torch.nn.Conv3d``.
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            groups,
+            norm_layer,
+            activation_layer,
+            dilation,
+            inplace,
+            bias,
+            torch.nn.Conv3d,
+        )
+class SqueezeExcitation(torch.nn.Module):
+    """
+    This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
+    Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
+    Args:
+        input_channels (int): Number of channels in the input image
+        squeeze_channels (int): Number of squeeze channels
+        activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU``
+        scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid``
+    """
     def __init__(
         self,
+        input_channels: int,
+        squeeze_channels: int,
+        activation: Callable[..., torch.nn.Module] = torch.nn.ReLU,
+        scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid,
     ) -> None:
         super().__init__()
         # _log_api_usage_once(self)
+        # Global average pool to a 1x1 map, then two 1x1 convolutions:
+        # input_channels -> squeeze_channels -> input_channels.
+        self.avgpool = torch.nn.AdaptiveAvgPool2d(1)
+        self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1)
+        self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1)
+        # Both activation factories are called with no arguments.
+        self.activation = activation()
+        self.scale_activation = scale_activation()
+    def _scale(self, input: Tensor) -> Tensor:
+        """Compute the per-channel gate: avgpool -> fc1 -> activation -> fc2 -> scale_activation."""
+        scale = self.avgpool(input)
+        scale = self.fc1(scale)
+        scale = self.activation(scale)
+        scale = self.fc2(scale)
+        return self.scale_activation(scale)
+    def forward(self, input: Tensor) -> Tensor:
+        """Return ``input`` multiplied by the gate from ``_scale`` (broadcast over the pooled dimensions)."""
+        scale = self._scale(input)
+        return scale * input
+class MLP(torch.nn.Sequential):
+    """This block implements the multi-layer perceptron (MLP) module.
+    Every entry of ``hidden_channels`` except the last produces Linear -> (norm) -> activation -> Dropout;
+    the last entry produces Linear -> Dropout only.
+    Args:
+        in_channels (int): Number of channels of the input
+        hidden_channels (List[int]): List of the hidden channel dimensions. NOTE(review): must be non-empty, ``hidden_channels[-1]`` is read unconditionally.
+        norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the linear layer. If ``None`` this layer won't be used. Default: ``None``
+        activation_layer (Callable[..., torch.nn.Module], optional): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the linear layer. Default: ``torch.nn.ReLU``. NOTE(review): although annotated ``Optional``, ``None`` is not handled -- the callable is invoked unconditionally.
+        inplace (bool): Parameter for the activation layer and the dropout layers, which can optionally do the operation in-place. Default ``True``
+        bias (bool): Whether to use bias in the linear layer. Default ``True``
+        dropout (float): The probability for the dropout layer. Default: 0.0
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        hidden_channels: List[int],
+        norm_layer: Optional[Callable[..., torch.nn.Module]] = None,
+        activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU,
+        inplace: Optional[bool] = True,
+        bias: bool = True,
+        dropout: float = 0.0,
+    ):
+        # The addition of `norm_layer` is inspired from the implementation of TorchMultimodal:
+        # https://github.com/facebookresearch/multimodal/blob/5dec8a/torchmultimodal/modules/layers/mlp.py
+        # ``params`` is shared by the activation and the Dropout constructors below.
+        params = {} if inplace is None else {"inplace": inplace}
+        layers = []
+        in_dim = in_channels
+        for hidden_dim in hidden_channels[:-1]:
+            layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
+            if norm_layer is not None:
+                layers.append(norm_layer(hidden_dim))
+            layers.append(activation_layer(**params))
+            layers.append(torch.nn.Dropout(dropout, **params))
+            in_dim = hidden_dim
+        # Output layer: no normalization or activation, but still followed by Dropout.
+        layers.append(torch.nn.Linear(in_dim, hidden_channels[-1], bias=bias))
+        layers.append(torch.nn.Dropout(dropout, **params))
+        super().__init__(*layers)
+        # _log_api_usage_once(self)
+class Permute(torch.nn.Module):
+    """This module returns a view of the tensor input with its dimensions permuted.
+    Args:
+        dims (List[int]): The desired ordering of dimensions
+    """
+    def __init__(self, dims: List[int]):
+        super().__init__()
+        # Stored as given and handed to ``torch.permute`` on every forward call.
+        self.dims = dims
     def forward(self, x: Tensor) -> Tensor:
+        # Thin wrapper so a permutation can be placed inside a module container.
+        return torch.permute(x, self.dims)
+########## EfficientNet ############
+@dataclass
+class _MBConvConfig:
+    """Settings for one stage of the network; base class of ``MBConvConfig`` and ``FusedMBConvConfig``."""
+    # Multiplier passed to ``adjust_channels`` by the blocks to get the expanded channel count.
+    expand_ratio: float
+    # Convolution kernel size and stride used by the block.
+    kernel: int
+    stride: int
+    input_channels: int
+    out_channels: int
+    # Number of blocks stacked in the stage.
+    num_layers: int
+    # Factory called as ``block(cnf, stochastic_depth_prob, norm_layer)`` when the network is built.
+    block: Callable[..., nn.Module]
+    @staticmethod
+    def adjust_channels(
+        channels: int, width_mult: float, min_value: Optional[int] = None
+    ) -> int:
+        """Scale ``channels`` by ``width_mult`` and pass the result through ``_make_divisible(..., 8, min_value)``."""
+        # ``_make_divisible`` is defined elsewhere in the file; presumably it rounds to a
+        # multiple of 8 -- confirm against its definition.
+        return _make_divisible(channels * width_mult, 8, min_value)
+class MBConvConfig(_MBConvConfig):
+    """Stage configuration whose channel counts and depth are scaled by ``width_mult`` / ``depth_mult``; builds ``MBConv`` blocks by default."""
+    # Stores information listed at Table 1 of the EfficientNet paper & Table 4 of the EfficientNetV2 paper
+    def __init__(
+        self,
+        expand_ratio: float,
+        kernel: int,
+        stride: int,
+        input_channels: int,
+        out_channels: int,
+        num_layers: int,
+        width_mult: float = 1.0,
+        depth_mult: float = 1.0,
+        block: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        # Apply the width/depth multipliers before the values are stored on the dataclass.
+        input_channels = self.adjust_channels(input_channels, width_mult)
+        out_channels = self.adjust_channels(out_channels, width_mult)
+        num_layers = self.adjust_depth(num_layers, depth_mult)
+        # ``MBConv`` is resolved at call time, so it may be defined later in the file.
+        if block is None:
+            block = MBConv
+        super().__init__(
+            expand_ratio,
+            kernel,
+            stride,
+            input_channels,
+            out_channels,
+            num_layers,
+            block,
+        )
+    @staticmethod
+    def adjust_depth(num_layers: int, depth_mult: float):
+        """Return ``num_layers * depth_mult`` rounded up to an int."""
+        return int(math.ceil(num_layers * depth_mult))
+class FusedMBConvConfig(_MBConvConfig):
+    """Stage configuration that builds ``FusedMBConv`` blocks by default; values are stored as given, with no width/depth scaling."""
+    # Stores information listed at Table 4 of the EfficientNetV2 paper
     def __init__(
         self,
+        expand_ratio: float,
+        kernel: int,
+        stride: int,
+        input_channels: int,
+        out_channels: int,
+        num_layers: int,
+        block: Optional[Callable[..., nn.Module]] = None,
+    ) -> None:
+        # ``FusedMBConv`` is resolved at call time, so it may be defined later in the file.
+        if block is None:
+            block = FusedMBConv
+        super().__init__(
+            expand_ratio,
+            kernel,
+            stride,
+            input_channels,
+            out_channels,
+            num_layers,
+            block,
         )
+class MBConv(nn.Module):
+    """
+    Block built from an optional 1x1 expansion conv, a depthwise conv, a squeeze-and-excitation
+    layer and a 1x1 projection conv, with a residual connection when shapes allow it.
+    Args:
+        cnf (MBConvConfig): Channel, kernel, stride and expand-ratio settings of the block
+        stochastic_depth_prob (float): Probability passed to ``StochasticDepth``
+        norm_layer (Callable[..., nn.Module]): Normalization layer used by every conv sub-block
+        se_layer (Callable[..., nn.Module]): Squeeze-and-excitation factory. Default: ``SqueezeExcitation``
+    Raises:
+        ValueError: if ``cnf.stride`` is not 1 or 2
+    """
     def __init__(
         self,
+        cnf: MBConvConfig,
+        stochastic_depth_prob: float,
+        norm_layer: Callable[..., nn.Module],
+        se_layer: Callable[..., nn.Module] = SqueezeExcitation,
     ) -> None:
+        super().__init__()
+        if not (1 <= cnf.stride <= 2):
+            raise ValueError("illegal stride value")
+        # The skip connection is only used when the stride and channel count are unchanged.
+        self.use_res_connect = (
+            cnf.stride == 1 and cnf.input_channels == cnf.out_channels
+        )
+        layers: List[nn.Module] = []
+        activation_layer = nn.SiLU
+        # expand
+        # (skipped when the adjusted channel count equals the input channel count)
+        expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+        if expanded_channels != cnf.input_channels:
+            layers.append(
+                Conv2dNormActivation(
+                    cnf.input_channels,
+                    expanded_channels,
+                    kernel_size=1,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                )
+            )
+        # depthwise
+        # (groups == channels, i.e. one filter group per channel)
+        layers.append(
+            Conv2dNormActivation(
+                expanded_channels,
+                expanded_channels,
+                kernel_size=cnf.kernel,
+                stride=cnf.stride,
+                groups=expanded_channels,
+                norm_layer=norm_layer,
+                activation_layer=activation_layer,
             )
+        )
+        # squeeze and excitation
+        # The squeeze width is derived from the block's input channels, not the expanded ones.
+        squeeze_channels = max(1, cnf.input_channels // 4)
+        layers.append(
+            se_layer(
+                expanded_channels,
+                squeeze_channels,
+                activation=partial(nn.SiLU, inplace=True),
+            )
+        )
+        # project
+        # (1x1 conv back to out_channels, with no activation)
+        layers.append(
+            Conv2dNormActivation(
+                expanded_channels,
+                cnf.out_channels,
+                kernel_size=1,
+                norm_layer=norm_layer,
+                activation_layer=None,
+            )
+        )
+        self.block = nn.Sequential(*layers)
+        # ``StochasticDepth`` is defined elsewhere; "row" is presumably its per-sample mode -- confirm.
+        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+        self.out_channels = cnf.out_channels
+    def forward(self, input: Tensor) -> Tensor:
+        """Run the block; when the residual connection is enabled, apply stochastic depth and add ``input``."""
+        result = self.block(input)
+        if self.use_res_connect:
+            result = self.stochastic_depth(result)
+            # In-place add of the skip connection.
+            result += input
+        return result
+class FusedMBConv(nn.Module):
+    """
+    Block in which expansion and spatial convolution are a single regular convolution,
+    followed by a 1x1 projection when the block actually expands; no squeeze-and-excitation.
+    Args:
+        cnf (FusedMBConvConfig): Channel, kernel, stride and expand-ratio settings of the block
+        stochastic_depth_prob (float): Probability passed to ``StochasticDepth``
+        norm_layer (Callable[..., nn.Module]): Normalization layer used by every conv sub-block
+    Raises:
+        ValueError: if ``cnf.stride`` is not 1 or 2
+    """
     def __init__(
         self,
+        cnf: FusedMBConvConfig,
+        stochastic_depth_prob: float,
+        norm_layer: Callable[..., nn.Module],
     ) -> None:
+        super().__init__()
+        if not (1 <= cnf.stride <= 2):
+            raise ValueError("illegal stride value")
+        # The skip connection is only used when the stride and channel count are unchanged.
+        self.use_res_connect = (
+            cnf.stride == 1 and cnf.input_channels == cnf.out_channels
         )
+        layers: List[nn.Module] = []
+        activation_layer = nn.SiLU
+        expanded_channels = cnf.adjust_channels(cnf.input_channels, cnf.expand_ratio)
+        if expanded_channels != cnf.input_channels:
+            # fused expand
+            layers.append(
+                Conv2dNormActivation(
+                    cnf.input_channels,
+                    expanded_channels,
+                    kernel_size=cnf.kernel,
+                    stride=cnf.stride,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                )
+            )
+            # project
+            layers.append(
+                Conv2dNormActivation(
+                    expanded_channels,
+                    cnf.out_channels,
+                    kernel_size=1,
+                    norm_layer=norm_layer,
+                    activation_layer=None,
+                )
+            )
+        else:
+            # No expansion: a single conv maps input_channels to out_channels and keeps its activation.
+            layers.append(
+                Conv2dNormActivation(
+                    cnf.input_channels,
+                    cnf.out_channels,
+                    kernel_size=cnf.kernel,
+                    stride=cnf.stride,
+                    norm_layer=norm_layer,
+                    activation_layer=activation_layer,
+                )
+            )
+        self.block = nn.Sequential(*layers)
+        # ``StochasticDepth`` is defined elsewhere; "row" is presumably its per-sample mode -- confirm.
+        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")
+        self.out_channels = cnf.out_channels
+    def forward(self, input: Tensor) -> Tensor:
+        """Run the block; when the residual connection is enabled, apply stochastic depth and add ``input``."""
+        result = self.block(input)
+        if self.use_res_connect:
+            result = self.stochastic_depth(result)
+            # In-place add of the skip connection.
+            result += input
+        return result
+class EfficientNetConfig(PretrainedConfig):
+    # Identifier for this configuration type.
+    model_type = "efficientnet"
     def __init__(
         self,
+        # inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+        dropout: float=0.25,
+        num_channels: int = 61,
+        stochastic_depth_prob: float = 0.2,
+        num_classes: int = 2,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        # last_channel: Optional[int] = None,
+        size: str='v2_s',
+        width_mult: float = 1.0,
+        depth_mult: float = 1.0,
+        **kwargs: Any,
     ) -> None:
+        """
+        Configuration holding the hyper-parameters that ``EfficientNetPreTrained`` passes to ``EfficientNet``
+        Args:
+            dropout (float): The dropout probability
+            num_channels (int): Number of input channels of the first convolution
+            stochastic_depth_prob (float): The stochastic depth probability
+            num_classes (int): Number of classes
+            norm_layer (Optional[Callable[..., nn.Module]]): Accepted but not stored on the config. NOTE(review): confirm this is intentional.
+            size (str): Architecture suffix; ``EfficientNet`` looks up ``"efficientnet_" + size``
+            width_mult (float): Width multiplier
+            depth_mult (float): Depth multiplier
+            **kwargs: Forwarded to ``PretrainedConfig.__init__``
+        """
+        # self.model = EfficientNet(
+        #                             dropout=dropout,
+        #                             num_channels=num_channels,
+        #                             num_classes=num_classes,
+        #                             size=size,
+        #                             stochastic_depth_prob=stochastic_depth_prob,
+        #                             width_mult=width_mult,
+        #                             depth_mult=depth_mult,
+        # )
+        #
+        # Attributes are assigned before the parent constructor runs.
+        self.dropout=dropout
+        self.num_channels=num_channels
+        self.num_classes=num_classes
+        self.size=size
+        self.stochastic_depth_prob=stochastic_depth_prob
+        self.width_mult=width_mult
+        self.depth_mult=depth_mult
+        super().__init__(**kwargs)
+class EfficientNetPreTrained(PreTrainedModel):
+    """``PreTrainedModel`` wrapper that builds an ``EfficientNet`` from an ``EfficientNetConfig`` and delegates ``forward`` to it."""
+    config_class = EfficientNetConfig
+    def __init__(
+        self,
+        config
+    ):
+        super().__init__(config)
+        # The wrapped network lives under ``self.model``; ``norm_layer`` is not passed,
+        # so ``EfficientNet`` falls back to its own default.
+        self.model = EfficientNet(  dropout=config.dropout,
+                                    num_channels=config.num_channels,
+                                    num_classes=config.num_classes,
+                                    size=config.size,
+                                    stochastic_depth_prob=config.stochastic_depth_prob,
+                                    width_mult=config.width_mult,
+                                    depth_mult=config.depth_mult,)
+    def forward(self, tensor):
+        """Return the wrapped network's output for ``tensor``."""
+        # NOTE(review): calls ``self.model.forward`` directly instead of ``self.model(tensor)``.
+        return self.model.forward(tensor)
+class EfficientNet(nn.Module):
+    def __init__(
+        self,
+        # inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+        dropout: float=0.25,
+        num_channels: int = 61,
+        stochastic_depth_prob: float = 0.2,
+        num_classes: int = 2,
+        norm_layer: Optional[Callable[..., nn.Module]] = None,
+        # last_channel: Optional[int] = None,
+        size: str='v2_s',
+        width_mult: float = 1.0,
+        depth_mult: float = 1.0,
+        **kwargs: Any,
+    ) -> None:
+        """
+        EfficientNet V1 and V2 main class
+        The network structure and the penultimate layer width are not arguments; they are
+        looked up from ``_efficientnet_conf("efficientnet_" + size, ...)``.
+        Args:
+            dropout (float): The dropout probability
+            num_channels (int): Number of input channels of the first convolution
+            stochastic_depth_prob (float): The stochastic depth probability
+            num_classes (int): Number of classes
+            norm_layer (Optional[Callable[..., nn.Module]]): Module specifying the normalization layer to use. Default: ``nn.BatchNorm2d``
+            size (str): Architecture suffix appended to ``"efficientnet_"``
+            width_mult (float): Width multiplier handed to ``_efficientnet_conf``
+            depth_mult (float): Depth multiplier handed to ``_efficientnet_conf``
+            **kwargs: Only the deprecated ``block`` key is read
+        """
+        super().__init__()
+        # _log_api_usage_once(self)
+        inverted_residual_setting, last_channel = _efficientnet_conf(
+                     "efficientnet_%s" % (size), width_mult=width_mult, depth_mult=depth_mult
+                    )
+        if not inverted_residual_setting:
+            raise ValueError("The inverted_residual_setting should not be empty")
+        elif not (
+            isinstance(inverted_residual_setting, Sequence)
+            and all([isinstance(s, _MBConvConfig) for s in inverted_residual_setting])
+        ):
+            raise TypeError(
+                "The inverted_residual_setting should be List[MBConvConfig]"
+            )
+        # Deprecated override: replaces the block factory on MBConvConfig entries only
+        # (FusedMBConvConfig entries are left untouched).
+        if "block" in kwargs:
+            warnings.warn(
+                "The parameter 'block' is deprecated since 0.13 and will be removed 0.15. "
+                "Please pass this information on 'MBConvConfig.block' instead."
+            )
+            if kwargs["block"] is not None:
+                for s in inverted_residual_setting:
+                    if isinstance(s, MBConvConfig):
+                        s.block = kwargs["block"]
+        if norm_layer is None:
+            norm_layer = nn.BatchNorm2d
+        layers: List[nn.Module] = []
+        # building first layer
+        # (stride-2 3x3 conv from num_channels to the first stage's input width)
+        firstconv_output_channels = inverted_residual_setting[0].input_channels
+        layers.append(
+            Conv2dNormActivation(
+                num_channels,
+                firstconv_output_channels,
+                kernel_size=3,
+                stride=2,
+                norm_layer=norm_layer,
+                activation_layer=nn.SiLU,
+            )
         )
+        # building inverted residual blocks
+        total_stage_blocks = sum(cnf.num_layers for cnf in inverted_residual_setting)
+        stage_block_id = 0
+        for cnf in inverted_residual_setting:
+            stage: List[nn.Module] = []
+            for _ in range(cnf.num_layers):
+                # copy to avoid modifications. shallow copy is enough
+                block_cnf = copy.copy(cnf)
+                # overwrite info if not the first conv in the stage
+                if stage:
+                    block_cnf.input_channels = block_cnf.out_channels
+                    block_cnf.stride = 1
+                # adjust stochastic depth probability based on the depth of the stage block
+                # (grows linearly from 0 for the first block towards stochastic_depth_prob)
+                sd_prob = (
+                    stochastic_depth_prob * float(stage_block_id) / total_stage_blocks
+                )
+                stage.append(block_cnf.block(block_cnf, sd_prob, norm_layer))
+                stage_block_id += 1
+            layers.append(nn.Sequential(*stage))
+        # building last several layers
+        # (when the configuration gives no last_channel, use 4x the last stage's output width)
+        lastconv_input_channels = inverted_residual_setting[-1].out_channels
+        lastconv_output_channels = (
+            last_channel if last_channel is not None else 4 * lastconv_input_channels
+        )
+        layers.append(
+            Conv2dNormActivation(
+                lastconv_input_channels,
+                lastconv_output_channels,
+                kernel_size=1,
+                norm_layer=norm_layer,
+                activation_layer=nn.SiLU,
+            )
+        )
+        self.features = nn.Sequential(*layers)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout, inplace=True),
+            nn.Linear(lastconv_output_channels, num_classes),
+        )
+        # Weight initialisation: Kaiming-normal (fan_out) convs with zero bias, unit-scale/zero-shift
+        # norm layers, and uniform(-1/sqrt(out_features), 1/sqrt(out_features)) linear weights with zero bias.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                init_range = 1.0 / math.sqrt(m.out_features)
+                nn.init.uniform_(m.weight, -init_range, init_range)
+                nn.init.zeros_(m.bias)
+        # super().__init__(**kwargs)
+    def _forward_impl(self, x: Tensor) -> Tensor:
+        """features -> global average pool -> flatten from dim 1 -> classifier."""
+        x = self.features(x)
+        x = self.avgpool(x)
+        x = torch.flatten(x, 1)
+        x = self.classifier(x)
+        return x
+    def forward(self, x: Tensor) -> Tensor:
+        """Return the classifier output for ``x`` (delegates to ``_forward_impl``)."""
+        return self._forward_impl(x)
+# def _efficientnet(
+#     inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]],
+#     dropout: float,
+#     last_channel: Optional[int],
+#     weights=None,
+#     num_channels: int = 61,
+#     stochastic_depth_prob: float = 0.2,
+#     progress: bool = True,
+#     num_classes: int = 2,
+#     **kwargs: Any,
+# ) -> EfficientNetConfig:
+#     model = EfficientNetConfig(
+#         inverted_residual_setting,
+#         dropout,
+#         num_classes=num_classes,
+#         num_channels=num_channels,
+#         stochastic_depth_prob=stochastic_depth_prob,
+#         last_channel=last_channel,
+#         **kwargs,
+#     )
+#     return model
+def _efficientnet_conf(
+    arch: str,
+    **kwargs: Any,
+) -> Tuple[Sequence[Union[MBConvConfig, FusedMBConvConfig]], Optional[int]]:
+    """
+    Return the per-stage block configurations and the last conv width for ``arch``.
+    Args:
+        arch (str): Architecture name, matched by prefix: ``efficientnet_b``, ``efficientnet_v2_s``,
+            ``efficientnet_v2_m`` or ``efficientnet_v2_l``
+        **kwargs: ``width_mult`` and ``depth_mult`` are required (popped without a default) for the
+            ``efficientnet_b`` branch; the ``v2`` branches do not read them
+    Returns:
+        ``(inverted_residual_setting, last_channel)``; ``last_channel`` is ``None`` for ``efficientnet_b``
+        and 1280 for the ``v2`` variants
+    Raises:
+        ValueError: if ``arch`` matches none of the known prefixes
+    """
+    inverted_residual_setting: Sequence[Union[MBConvConfig, FusedMBConvConfig]]
+    # Positional arguments of every entry below are
+    # (expand_ratio, kernel, stride, input_channels, out_channels, num_layers).
+    if arch.startswith("efficientnet_b"):
+        # Every "efficientnet_b*" name shares this table; variants differ only through the multipliers.
+        bneck_conf = partial(
+            MBConvConfig,
+            width_mult=kwargs.pop("width_mult"),
+            depth_mult=kwargs.pop("depth_mult"),
+        )
+        inverted_residual_setting = [
+            bneck_conf(1, 3, 1, 32, 16, 1),
+            bneck_conf(6, 3, 2, 16, 24, 2),
+            bneck_conf(6, 5, 2, 24, 40, 2),
+            bneck_conf(6, 3, 2, 40, 80, 3),
+            bneck_conf(6, 5, 1, 80, 112, 3),
+            bneck_conf(6, 5, 2, 112, 192, 4),
+            bneck_conf(6, 3, 1, 192, 320, 1),
+        ]
+        last_channel = None
+    elif arch.startswith("efficientnet_v2_s"):
+        inverted_residual_setting = [
+            FusedMBConvConfig(1, 3, 1, 24, 24, 2),
+            FusedMBConvConfig(4, 3, 2, 24, 48, 4),
+            FusedMBConvConfig(4, 3, 2, 48, 64, 4),
+            MBConvConfig(4, 3, 2, 64, 128, 6),
+            MBConvConfig(6, 3, 1, 128, 160, 9),
+            MBConvConfig(6, 3, 2, 160, 256, 15),
+        ]
+        last_channel = 1280
+    elif arch.startswith("efficientnet_v2_m"):
+        inverted_residual_setting = [
+            FusedMBConvConfig(1, 3, 1, 24, 24, 3),
+            FusedMBConvConfig(4, 3, 2, 24, 48, 5),
+            FusedMBConvConfig(4, 3, 2, 48, 80, 5),
+            MBConvConfig(4, 3, 2, 80, 160, 7),
+            MBConvConfig(6, 3, 1, 160, 176, 14),
+            MBConvConfig(6, 3, 2, 176, 304, 18),
+            MBConvConfig(6, 3, 1, 304, 512, 5),
+        ]
+        last_channel = 1280
+    elif arch.startswith("efficientnet_v2_l"):
+        inverted_residual_setting = [
+            FusedMBConvConfig(1, 3, 1, 32, 32, 4),
+            FusedMBConvConfig(4, 3, 2, 32, 64, 7),
+            FusedMBConvConfig(4, 3, 2, 64, 96, 7),
+            MBConvConfig(4, 3, 2, 96, 192, 10),
+            MBConvConfig(6, 3, 1, 192, 224, 19),
+            MBConvConfig(6, 3, 2, 224, 384, 25),
+            MBConvConfig(6, 3, 1, 384, 640, 7),
+        ]
+        last_channel = 1280
+    else:
+        raise ValueError(f"Unsupported model type {arch}")
+    return inverted_residual_setting, last_channel
+##### normal stuff ####
 def normalize_array(x: list):