odunkel committed
Commit 79cc514 · verified · 1 Parent(s): f3b59a6

Upload 9 files

app.py ADDED
@@ -0,0 +1,243 @@
1
+
2
+ import os
3
+ import gradio as gr
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from PIL import Image, ImageDraw
8
+ from matplotlib import cm
9
+
10
+ from model_utils.extractor_dino import ViTExtractor
11
+ from model_utils.projection_network import AggregationNetwork, DummyAggregationNetwork
12
+
13
+ def resize(img, target_res=224, resize=True, to_pil=True, edge=False, sampling_filter='lanczos'):
14
+ filt = Image.Resampling.LANCZOS if sampling_filter == 'lanczos' else Image.Resampling.NEAREST
15
+ original_width, original_height = img.size
16
+ original_channels = len(img.getbands())
17
+ if not edge:
18
+ canvas = np.zeros([target_res, target_res, 3], dtype=np.uint8)
19
+ if original_channels == 1:
20
+ canvas = np.zeros([target_res, target_res], dtype=np.uint8)
21
+ if original_height <= original_width:
22
+ if resize:
23
+ img = img.resize((target_res, int(np.around(target_res * original_height / original_width))), filt)
24
+ width, height = img.size
25
+ img = np.asarray(img)
26
+ canvas[(width - height) // 2: (width + height) // 2] = img
27
+ else:
28
+ if resize:
29
+ img = img.resize((int(np.around(target_res * original_width / original_height)), target_res), filt)
30
+ width, height = img.size
31
+ img = np.asarray(img)
32
+ canvas[:, (height - width) // 2: (height + width) // 2] = img
33
+ else:
34
+ if original_height <= original_width:
35
+ if resize:
36
+ img = img.resize((target_res, int(np.around(target_res * original_height / original_width))), filt)
37
+ width, height = img.size
38
+ img = np.asarray(img)
39
+ top_pad = (target_res - height) // 2
40
+ bottom_pad = target_res - height - top_pad
41
+ img = np.pad(img, pad_width=[(top_pad, bottom_pad), (0, 0), (0, 0)], mode='edge')
42
+ else:
43
+ if resize:
44
+ img = img.resize((int(np.around(target_res * original_width / original_height)), target_res), filt)
45
+ width, height = img.size
46
+ img = np.asarray(img)
47
+ left_pad = (target_res - width) // 2
48
+ right_pad = target_res - width - left_pad
49
+ img = np.pad(img, pad_width=[(0, 0), (left_pad, right_pad), (0, 0)], mode='edge')
50
+ canvas = img
51
+ if to_pil:
52
+ canvas = Image.fromarray(canvas)
53
+ return canvas
54
+
55
+ # ─── Configuration ───────────────────────────────────────────────
56
+ num_patches = 30
57
+ target_res = num_patches * 14
58
+ ckpt_file = "ckpts/dino_spair_0300.pth"
59
+
60
+ # ─── Model setup ─────────────────────────────────────────────────
61
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
62
+ aggre_net = AggregationNetwork(feature_dims=[768], projection_dim=768, device=device)
63
+ aggre_net.load_pretrained_weights(torch.load(ckpt_file, map_location=device))
64
+ aggre_net_dummy = DummyAggregationNetwork()
65
+ extractor_vit = ViTExtractor('dinov2_vitb14', stride=14, device=device)
66
+
67
+ # ─── Feature extraction ──────────────────────────────────────────
68
+ def get_processed_features_dino(num_patches, img, use_dummy):
69
+ batch = extractor_vit.preprocess_pil(img)
70
+ features_dino = extractor_vit.extract_descriptors(batch.to(device), layer=11, facet='token') \
71
+ .permute(0,1,3,2) \
72
+ .reshape(1, -1, num_patches, num_patches)
73
+ # Project + normalize
74
+ with torch.no_grad():
75
+ if use_dummy == "DINOv2":
76
+ desc = aggre_net_dummy(features_dino)
77
+ else:
78
+ desc = aggre_net(features_dino)
79
+ norms = torch.linalg.norm(desc, dim=1, keepdim=True)
80
+ desc = desc / (norms + 1e-8)
81
+ return desc # shape [1, C, num_patches, num_patches]
82
+
83
+ # ─── Similarity computation ───────────────────────────────────────
84
+ def get_sim(
85
+ coord: tuple[int,int],
86
+ feat1: torch.Tensor,
87
+ feat2: torch.Tensor,
88
+ img_size: int = target_res
89
+ ) -> np.ndarray:
90
+ """
91
+ Upsamples the DINO features to `img_size`, then computes cosine‐similarity
92
+ between the feature at `coord` in source and every spatial location in target.
93
+ """
94
+ y, x = coord # row, col
95
+
96
+ # Upsample both feature maps to [1, C, img_size, img_size]
97
+ upsampler = nn.Upsample(size=(img_size, img_size), mode='bilinear', align_corners=False)
98
+ src_ft = upsampler(feat1) # [1, C, img_size, img_size]
99
+ trg_ft = upsampler(feat2)
100
+
101
+ # Extract the C‐dim vector at the clicked location
102
+ C = src_ft.size(1)
103
+ src_vec = src_ft[0, :, y, x].view(1, C, 1, 1) # [1, C, 1, 1]
104
+
105
+ # Cosine similarity along channel‐dim
106
+ cos = nn.CosineSimilarity(dim=1)
107
+ cos_map = cos(src_vec, trg_ft)[0] # [img_size, img_size]
108
+ return cos_map.cpu().numpy()
109
+
110
+ # ─── Drawing helper ───────────────────────────────────────────────
111
+ def draw_point(img_arr: np.ndarray, x: int, y: int, size: int, color=(255,0,0)) -> np.ndarray:
112
+ pil = Image.fromarray(img_arr)
113
+ draw = ImageDraw.Draw(pil)
114
+ r = size // 2
115
+ draw.ellipse((x-r, y-r, x+r, y+r), fill=color, outline=color)
116
+ return np.array(pil)
117
+
118
+ # ─── Feature‐updating callback ───────────────────────────────────
119
+ def update_features(
120
+ img: Image,
121
+ num_patches,
122
+ use_dummy
123
+ ):
124
+ """
125
+ Given a PIL image, returns:
126
+ 1) the resized PIL image (for display), 2) its descriptor tensor (stored in a gr.State),
127
+ and 3) a clean copy of the resized image, kept for re-drawing overlays later.
128
+ """
129
+ if img is None:
130
+ return None, None, None
131
+ img = resize(img, target_res=target_res, resize=True, to_pil=True)
132
+ feat = get_processed_features_dino(num_patches, img=img, use_dummy=use_dummy)
133
+ return img, feat.cpu(), Image.fromarray(np.array(img))
134
+
135
+ # ─── Click handler ───────────────────────────────────────────────
136
+ def on_select(
137
+ source_pil: Image,
138
+ target_pil: Image,
139
+ feat1: torch.Tensor,
140
+ feat2: torch.Tensor,
141
+ alpha: float,
142
+ scatter_size: int,
143
+ or_tgt_img: Image,
144
+ or_src_img: Image,
145
+ sel: gr.SelectData
146
+ ):
147
+ # Convert to numpy arrays
148
+ src_arr = np.array(or_src_img)
149
+ tgt_arr = np.array(or_tgt_img)
150
+
151
+ # Gradio returns the click coordinates as (x, y) = (col, row)
152
+ x, y = sel.index
153
+
154
+ src_marked = draw_point(src_arr, x, y, scatter_size)
155
+
156
+ # Compute similarity map (get_sim expects (row, col))
157
+ sim_map = get_sim((y, x), feat1, feat2, img_size=target_res)
158
+
159
+ mn, mx = sim_map.min(), sim_map.max()
160
+ sim_norm = (sim_map - mn) / ((mx - mn) + 1e-12)
161
+
162
+ # Build RGBA heatmap
163
+ heat = cm.viridis(sim_norm) # H×W×4
164
+ heat[..., 3] = sim_norm * alpha # alpha channel
165
+
166
+ # Composite over fresh target
167
+ tgt_f = tgt_arr.astype(np.float32) / 255.0
168
+ comp = heat[..., :3] * heat[..., 3:4] + tgt_f * (1 - heat[..., 3:4])
169
+ overlay = (comp * 255).astype(np.uint8)
170
+
171
+ # Draw a red dot at the best match
172
+ my, mx_ = np.unravel_index(sim_map.argmax(), sim_map.shape)
173
+ overlay_marked = draw_point(overlay, mx_, my, scatter_size)
174
+
175
+ return src_marked, overlay_marked
176
+
177
+ def reload_img(
178
+ or_src_img: Image,
179
+ or_tgt_img: Image,
180
+ ):
181
+ return or_src_img, or_tgt_img
182
+
183
+
184
+
185
+ # ─── Build Gradio UI ──────────────────────────────────────────────
186
+ with gr.Blocks() as demo:
187
+ # Hidden states to hold features
188
+ feat1_state = gr.State()
189
+ feat2_state = gr.State()
190
+ or_tgt_img = gr.State()
191
+ or_src_img = gr.State()
192
+
193
+ # Introduction text box
194
+ intro_text = gr.Markdown("""
195
+ ## Do It Yourself: Learning Semantic Correspondence from Pseudo-Labels
196
+ [Project Page](https://example.com) | [GitHub Repository](https://github.com/example/repo)
197
+
198
+ Welcome to the DIY-SC demo!
199
+ Upload two images and select a keypoint in the source image. This demo will compute and visualize the feature similarity map and a corresponding point in the target image.
200
+ You can choose between DIY-SC (refined DINOv2 features) and plain DINOv2 features.
201
+ """)
202
+
203
+ # Image upload / display components
204
+ with gr.Row():
205
+ src = gr.Image(interactive=True, type="pil", label="Source Image")
206
+ tgt = gr.Image(interactive=True, type="pil", label="Target Image")
207
+
208
+ # Controls
209
+ alpha = gr.State(0.7)
210
+ scatter = gr.State(10)
211
+ use_dummy = gr.Radio(["DIY-SC", "DINOv2"], value="DIY-SC", label="Feature Extractor")
212
+
213
+ src.input(
214
+ fn=update_features,
215
+ inputs=[src, gr.State(num_patches), use_dummy],
216
+ outputs=[src, feat1_state, or_src_img]
217
+ )
218
+
219
+ tgt.input(
220
+ fn=update_features,
221
+ inputs=[tgt, gr.State(num_patches), use_dummy],
221
+ outputs=[tgt, feat2_state, or_tgt_img]
223
+ )
224
+
225
+ use_dummy.change(
226
+ fn=update_features,
227
+ inputs=[or_src_img, gr.State(num_patches), use_dummy],
228
+ outputs=[src, feat1_state, or_src_img]
229
+ )
230
+
231
+ use_dummy.change(
232
+ fn=update_features,
233
+ inputs=[or_tgt_img, gr.State(num_patches), use_dummy],
234
+ outputs=[tgt, feat2_state, or_tgt_img]
235
+ )
236
+
237
+ src.select(
238
+ fn=on_select,
239
+ inputs=[src, tgt, feat1_state, feat2_state, alpha, scatter, or_tgt_img, or_src_img],
239
+ outputs=[src, tgt]
241
+ )
242
+
243
+ demo.launch(share=True)
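
The pieces above can also be driven without the UI. Below is a minimal headless sketch, assuming hypothetical local files source.jpg and target.jpg and that the helpers are imported from app.py (e.g. from app import resize, get_processed_features_dino, get_sim, num_patches, target_res); note that app.py calls demo.launch on import, so in practice one would guard that call or copy the helpers.

import numpy as np
from PIL import Image

# hypothetical input files; any two RGB images work
src = resize(Image.open("source.jpg").convert("RGB"), target_res, resize=True, to_pil=True)
tgt = resize(Image.open("target.jpg").convert("RGB"), target_res, resize=True, to_pil=True)
feat_src = get_processed_features_dino(num_patches, img=src, use_dummy="DIY-SC")
feat_tgt = get_processed_features_dino(num_patches, img=tgt, use_dummy="DIY-SC")
# similarity of the source pixel at (row=100, col=150) against every target location
sim = get_sim((100, 150), feat_src, feat_tgt, img_size=target_res)
row, col = np.unravel_index(sim.argmax(), sim.shape)
print("best match in target:", row, col)
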
ckpts/dino_spair_0300.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42e9c8a4d27041af7bb8dc5240ea3e83d47e013b72541dc8c5829368f247e705
3
+ size 2519767
model_utils/__pycache__/extractor_dino.cpython-310.pyc ADDED
Binary file (14.5 kB).
 
model_utils/__pycache__/projection_network.cpython-310.pyc ADDED
Binary file (5.36 kB).
 
model_utils/__pycache__/resnet.cpython-310.pyc ADDED
Binary file (15.4 kB).
 
model_utils/extractor_dino.py ADDED
@@ -0,0 +1,356 @@
1
+ import torch
2
+ from torch import nn
3
+ from torchvision import transforms
4
+ import torch.nn.modules.utils as nn_utils
5
+ import math
6
+ import types
7
+ from pathlib import Path
8
+ from typing import Union, List, Tuple
9
+ from PIL import Image
10
+
11
+
12
+ class ViTExtractor:
13
+ """ This class facilitates extraction of features, descriptors, and saliency maps from a ViT.
14
+ We use the following notation in the documentation of the module's methods:
15
+ B - batch size
16
+ h - number of heads. usually takes place of the channel dimension in pytorch's convention BxCxHxW
17
+ p - patch size of the ViT. either 8, 14 (DINOv2), or 16.
18
+ t - number of tokens. equals the number of patches + 1, e.g. HW / p**2 + 1. Where H and W are the height and width
19
+ of the input image.
20
+ d - the embedding dimension in the ViT.
21
+ """
22
+
23
+ def __init__(self, model_type: str = 'dino_vits8', stride: int = 4, model: nn.Module = None, device: str = 'cuda'):
24
+ """
25
+ :param model_type: A string specifying the type of model to extract from.
26
+ [dino_vits8 | dino_vits16 | dino_vitb8 | dino_vitb16 | vit_small_patch8_224 |
27
+ vit_small_patch16_224 | vit_base_patch8_224 | vit_base_patch16_224]
28
+ :param stride: stride of first convolution layer. small stride -> higher resolution.
29
+ :param model: Optional parameter. The nn.Module to extract from instead of creating a new one in ViTExtractor.
30
+ should be compatible with model_type.
31
+ """
32
+ self.model_type = model_type
33
+ self.device = device
34
+ if model is not None:
35
+ self.model = model
36
+ else:
37
+ self.model = ViTExtractor.create_model(model_type)
38
+
39
+ self.model = ViTExtractor.patch_vit_resolution(self.model, stride=stride)
40
+ self.model.eval()
41
+ self.model.to(self.device)
42
+ self.p = self.model.patch_embed.patch_size
43
+ if type(self.p)==tuple:
44
+ self.p = self.p[0]
45
+ self.stride = self.model.patch_embed.proj.stride
46
+
47
+ self.mean = (0.485, 0.456, 0.406) if "dino" in self.model_type else (0.5, 0.5, 0.5)
48
+ self.std = (0.229, 0.224, 0.225) if "dino" in self.model_type else (0.5, 0.5, 0.5)
49
+
50
+ self._feats = []
51
+ self.hook_handlers = []
52
+ self.load_size = None
53
+ self.num_patches = None
54
+
55
+ @staticmethod
56
+ def create_model(model_type: str) -> nn.Module:
57
+ """
58
+ :param model_type: a string specifying which model to load. [dino_vits8 | dino_vits16 | dino_vitb8 |
59
+ dino_vitb16 | vit_small_patch8_224 | vit_small_patch16_224 | vit_base_patch8_224 |
60
+ vit_base_patch16_224]
61
+ :return: the model
62
+ """
63
+ torch.hub._validate_not_a_forked_repo=lambda a,b,c: True
64
+ if 'v2' in model_type:
65
+ model = torch.hub.load('facebookresearch/dinov2', model_type)
66
+ elif 'dino' in model_type:
67
+ model = torch.hub.load('facebookresearch/dino:main', model_type)
68
+ elif 'ibot' in model_type:
69
+ model = torch.hub.load('facebookresearch/dino:main', 'dino_vitb16')
70
+ temp_state_dict = torch.load("ibot/checkpoint_teacher.pth", map_location="cpu")
71
+ temp_state_dict = temp_state_dict["state_dict"]
72
+ # remove `module.` prefix
73
+ temp_state_dict = {k.replace("module.", ""): v for k, v in temp_state_dict.items()}
74
+ # remove `backbone.` prefix induced by multicrop wrapper
75
+ temp_state_dict = {k.replace("backbone.", ""): v for k, v in temp_state_dict.items()}
76
+ msg = model.load_state_dict(temp_state_dict, strict=False)
77
+ print(msg)
78
+ else: # model from timm -- load weights from timm to dino model (enables working on arbitrary size images).
79
+ import timm
80
+ temp_model = timm.create_model(model_type, pretrained=True)
81
+ model_type_dict = {
82
+ 'vit_small_patch16_224': 'dino_vits16',
83
+ 'vit_small_patch8_224': 'dino_vits8',
84
+ 'vit_base_patch16_224': 'dino_vitb16',
85
+ 'vit_base_patch8_224': 'dino_vitb8'
86
+ }
87
+ model = torch.hub.load('facebookresearch/dino:main', model_type_dict[model_type])
88
+ temp_state_dict = temp_model.state_dict()
89
+ del temp_state_dict['head.weight']
90
+ del temp_state_dict['head.bias']
91
+ model.load_state_dict(temp_state_dict)
92
+ return model
93
+
94
+ @staticmethod
95
+ def _fix_pos_enc(patch_size: int, stride_hw: Tuple[int, int]):
96
+ """
97
+ Creates a method for position encoding interpolation.
98
+ :param patch_size: patch size of the model.
99
+ :param stride_hw: A tuple containing the new height and width stride respectively.
100
+ :return: the interpolation method
101
+ """
102
+ def interpolate_pos_encoding(self, x: torch.Tensor, w: int, h: int) -> torch.Tensor:
103
+ npatch = x.shape[1] - 1
104
+ N = self.pos_embed.shape[1] - 1
105
+ if npatch == N and w == h:
106
+ return self.pos_embed
107
+ class_pos_embed = self.pos_embed[:, 0]
108
+ patch_pos_embed = self.pos_embed[:, 1:]
109
+ dim = x.shape[-1]
110
+ # compute number of tokens taking stride into account
111
+ w0 = 1 + (w - patch_size) // stride_hw[1]
112
+ h0 = 1 + (h - patch_size) // stride_hw[0]
113
+ assert (w0 * h0 == npatch), f"""got wrong grid size for {h}x{w} with patch_size {patch_size} and
114
+ stride {stride_hw} got {h0}x{w0}={h0 * w0} expecting {npatch}"""
115
+ # we add a small number to avoid floating point error in the interpolation
116
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
117
+ w0, h0 = w0 + 0.1, h0 + 0.1
118
+ patch_pos_embed = nn.functional.interpolate(
119
+ patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
120
+ scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
121
+ mode='bicubic',
122
+ align_corners=False, recompute_scale_factor=False
123
+ )
124
+ assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
125
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
126
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
127
+
128
+ return interpolate_pos_encoding
129
+
130
+ @staticmethod
131
+ def patch_vit_resolution(model: nn.Module, stride: int) -> nn.Module:
132
+ """
133
+ change resolution of model output by changing the stride of the patch extraction.
134
+ :param model: the model to change resolution for.
135
+ :param stride: the new stride parameter.
136
+ :return: the adjusted model
137
+ """
138
+ patch_size = model.patch_embed.patch_size
139
+ if type(patch_size) == tuple:
140
+ patch_size = patch_size[0]
141
+ if stride == patch_size: # nothing to do
142
+ return model
143
+
144
+ stride = nn_utils._pair(stride)
145
+ assert all([(patch_size // s_) * s_ == patch_size for s_ in
146
+ stride]), f'stride {stride} should divide patch_size {patch_size}'
147
+
148
+ # fix the stride
149
+ model.patch_embed.proj.stride = stride
150
+ # fix the positional encoding code
151
+ model.interpolate_pos_encoding = types.MethodType(ViTExtractor._fix_pos_enc(patch_size, stride), model)
152
+ return model
153
+
154
+ def preprocess(self, image_path: Union[str, Path],
155
+ load_size: Union[int, Tuple[int, int]] = None, patch_size: int = 14) -> Tuple[torch.Tensor, Image.Image]:
156
+ """
157
+ Preprocesses an image before extraction.
158
+ :param image_path: path to image to be extracted.
159
+ :param load_size: optional. Size to resize image before the rest of preprocessing.
160
+ :return: a tuple containing:
161
+ (1) the preprocessed image as a tensor of shape BxCxHxW, ready to be fed into the model.
162
+ (2) the pil image in relevant dimensions
163
+ """
164
+ def divisible_by_num(num, dim):
165
+ return num * (dim // num)
166
+ pil_image = Image.open(image_path).convert('RGB')
167
+ if load_size is not None:
168
+ pil_image = transforms.Resize(load_size, interpolation=transforms.InterpolationMode.LANCZOS)(pil_image)
169
+
170
+ width, height = pil_image.size
171
+ new_width = divisible_by_num(patch_size, width)
172
+ new_height = divisible_by_num(patch_size, height)
173
+ pil_image = pil_image.resize((new_width, new_height), resample=Image.LANCZOS)
174
+
175
+ prep = transforms.Compose([
176
+ transforms.ToTensor(),
177
+ transforms.Normalize(mean=self.mean, std=self.std)
178
+ ])
179
+ prep_img = prep(pil_image)[None, ...]
180
+ return prep_img, pil_image
181
+
182
+ def preprocess_pil(self, pil_image):
183
+ """
184
+ Preprocesses an image before extraction.
185
+ :param pil_image: the PIL image to preprocess.
186
+ :return: the preprocessed image as a tensor of shape BxCxHxW,
187
+ ready to be fed into the model.
188
+ Unlike preprocess, no file loading or resizing is done here;
189
+ the caller is expected to resize the image beforehand.
190
+ """
191
+ prep = transforms.Compose([
192
+ transforms.ToTensor(),
193
+ transforms.Normalize(mean=self.mean, std=self.std)
194
+ ])
195
+ prep_img = prep(pil_image)[None, ...]
196
+ return prep_img
197
+
198
+ def _get_hook(self, facet: str):
199
+ """
200
+ generate a hook method for a specific block and facet.
201
+ """
202
+ if facet in ['attn', 'token']:
203
+ def _hook(model, input, output):
204
+ self._feats.append(output)
205
+ return _hook
206
+
207
+ if facet == 'query':
208
+ facet_idx = 0
209
+ elif facet == 'key':
210
+ facet_idx = 1
211
+ elif facet == 'value':
212
+ facet_idx = 2
213
+ else:
214
+ raise TypeError(f"{facet} is not a supported facet.")
215
+
216
+ def _inner_hook(module, input, output):
217
+ input = input[0]
218
+ B, N, C = input.shape
219
+ qkv = module.qkv(input).reshape(B, N, 3, module.num_heads, C // module.num_heads).permute(2, 0, 3, 1, 4)
220
+ self._feats.append(qkv[facet_idx]) #Bxhxtxd
221
+ return _inner_hook
222
+
223
+ def _register_hooks(self, layers: List[int], facet: str) -> None:
224
+ """
225
+ register hook to extract features.
226
+ :param layers: layers from which to extract features.
227
+ :param facet: facet to extract. One of the following options: ['key' | 'query' | 'value' | 'token' | 'attn']
228
+ """
229
+ for block_idx, block in enumerate(self.model.blocks):
230
+ if block_idx in layers:
231
+ if facet == 'token':
232
+ self.hook_handlers.append(block.register_forward_hook(self._get_hook(facet)))
233
+ elif facet == 'attn':
234
+ self.hook_handlers.append(block.attn.attn_drop.register_forward_hook(self._get_hook(facet)))
235
+ elif facet in ['key', 'query', 'value']:
236
+ self.hook_handlers.append(block.attn.register_forward_hook(self._get_hook(facet)))
237
+ else:
238
+ raise TypeError(f"{facet} is not a supported facet.")
239
+
240
+ def _unregister_hooks(self) -> None:
241
+ """
242
+ unregisters the hooks. should be called after feature extraction.
243
+ """
244
+ for handle in self.hook_handlers:
245
+ handle.remove()
246
+ self.hook_handlers = []
247
+
248
+ def _extract_features(self, batch: torch.Tensor, layers: List[int] = [11], facet: str = 'key') -> List[torch.Tensor]:
249
+ """
250
+ extract features from the model
251
+ :param batch: batch to extract features for. Has shape BxCxHxW.
252
+ :param layers: layers to extract from. Each is a number between 0 and 11.
253
+ :param facet: facet to extract. One of the following options: ['key' | 'query' | 'value' | 'token' | 'attn']
254
+ :return : tensor of features.
255
+ if facet is 'key' | 'query' | 'value' has shape Bxhxtxd
256
+ if facet is 'attn' has shape Bxhxtxt
257
+ if facet is 'token' has shape Bxtxd
258
+ """
259
+ B, C, H, W = batch.shape
260
+ self._feats = []
261
+ self._register_hooks(layers, facet)
262
+ _ = self.model(batch)
263
+ self._unregister_hooks()
264
+ self.load_size = (H, W)
265
+ self.num_patches = (1 + (H - self.p) // self.stride[0], 1 + (W - self.p) // self.stride[1])
266
+ return self._feats
267
+
268
+ def _log_bin(self, x: torch.Tensor, hierarchy: int = 2) -> torch.Tensor:
269
+ """
270
+ create a log-binned descriptor.
271
+ :param x: tensor of features. Has shape Bxhxtxd. [1,6,3410,64]
272
+ :param hierarchy: how many bin hierarchies to use.
273
+ """
274
+ B = x.shape[0]
275
+ num_bins = 1 + 8 * hierarchy
276
+
277
+ bin_x = x.permute(0, 2, 3, 1).flatten(start_dim=-2, end_dim=-1) # Bx(t-1)x(dxh) [1,3410,384]
278
+ bin_x = bin_x.permute(0, 2, 1)
279
+ bin_x = bin_x.reshape(B, bin_x.shape[1], self.num_patches[0], self.num_patches[1])
280
+ # Bx(dxh)xnum_patches[0]xnum_patches[1]
281
+ sub_desc_dim = bin_x.shape[1]
282
+
283
+ avg_pools = []
284
+ # compute bins of all sizes for all spatial locations.
285
+ for k in range(0, hierarchy):
286
+ # avg pooling with kernel 3**kx3**k
287
+ win_size = 3 ** k
288
+ avg_pool = torch.nn.AvgPool2d(win_size, stride=1, padding=win_size // 2, count_include_pad=False)
289
+ avg_pools.append(avg_pool(bin_x))
290
+
291
+ bin_x = torch.zeros((B, sub_desc_dim * num_bins, self.num_patches[0], self.num_patches[1])).to(self.device)
292
+ for y in range(self.num_patches[0]):
293
+ for x in range(self.num_patches[1]):
294
+ part_idx = 0
295
+ # fill all bins for a spatial location (y, x)
296
+ for k in range(0, hierarchy):
297
+ kernel_size = 3 ** k
298
+ for i in range(y - kernel_size, y + kernel_size + 1, kernel_size):
299
+ for j in range(x - kernel_size, x + kernel_size + 1, kernel_size):
300
+ if i == y and j == x and k != 0:
301
+ continue
302
+ if 0 <= i < self.num_patches[0] and 0 <= j < self.num_patches[1]:
303
+ bin_x[:, part_idx * sub_desc_dim: (part_idx + 1) * sub_desc_dim, y, x] = avg_pools[k][
304
+ :, :, i, j]
305
+ else: # handle padding in a more delicate way than zero padding
306
+ temp_i = max(0, min(i, self.num_patches[0] - 1))
307
+ temp_j = max(0, min(j, self.num_patches[1] - 1))
308
+ bin_x[:, part_idx * sub_desc_dim: (part_idx + 1) * sub_desc_dim, y, x] = avg_pools[k][
309
+ :, :, temp_i,
310
+ temp_j]
311
+ part_idx += 1
312
+ bin_x = bin_x.flatten(start_dim=-2, end_dim=-1).permute(0, 2, 1).unsqueeze(dim=1)
313
+ # Bx1x(t-1)x(dxh)
314
+ return bin_x #[1,1,3410,6528]
315
+
316
+ def extract_descriptors(self, batch: torch.Tensor, layer: int = 11, facet: str = 'key',
317
+ bin: bool = False, include_cls: bool = False) -> torch.Tensor:
318
+ """
319
+ extract descriptors from the model
320
+ :param batch: batch to extract descriptors for. Has shape BxCxHxW.
321
+ :param layer: layer to extract. A number between 0 and 11.
322
+ :param facet: facet to extract. One of the following options: ['key' | 'query' | 'value' | 'token']
323
+ :param bin: apply log binning to the descriptor. default is False.
324
+ :return: tensor of descriptors. Bx1xtxd' where d' is the dimension of the descriptors.
325
+ """
326
+ assert facet in ['key', 'query', 'value', 'token'], f"""{facet} is not a supported facet for descriptors.
327
+ choose from ['key' | 'query' | 'value' | 'token'] """
328
+ self._extract_features(batch, [layer], facet)
329
+ x = self._feats[0]
330
+ if facet == 'token':
331
+ x.unsqueeze_(dim=1) #Bx1xtxd
332
+ if not include_cls:
333
+ x = x[:, :, 1:, :] # remove cls token
334
+ else:
335
+ assert not bin, "bin = True and include_cls = True are not supported together, set one of them False."
336
+ if not bin:
337
+ desc = x.permute(0, 2, 3, 1).flatten(start_dim=-2, end_dim=-1).unsqueeze(dim=1) # Bx1xtx(dxh)
338
+ else:
339
+ desc = self._log_bin(x)
340
+ return desc
341
+
342
+ def extract_saliency_maps(self, batch: torch.Tensor) -> torch.Tensor:
343
+ """
344
+ extract saliency maps. The saliency maps are extracted by averaging several attention heads from the last layer
345
+ in of the CLS token. All values are then normalized to range between 0 and 1.
346
+ :param batch: batch to extract saliency maps for. Has shape BxCxHxW.
347
+ :return: a tensor of saliency maps. has shape Bxt-1
348
+ """
349
+ assert self.model_type == "dino_vits8", f"saliency maps are supported only for dino_vits model_type."
350
+ self._extract_features(batch, [11], 'attn')
351
+ head_idxs = [0, 2, 4, 5]
352
+ curr_feats = self._feats[0] #Bxhxtxt
353
+ cls_attn_map = curr_feats[:, head_idxs, 0, 1:].mean(dim=1) #Bx(t-1)
354
+ temp_mins, temp_maxs = cls_attn_map.min(dim=1)[0], cls_attn_map.max(dim=1)[0]
355
+ cls_attn_maps = (cls_attn_map - temp_mins) / (temp_maxs - temp_mins) # normalize to range [0,1]
356
+ return cls_attn_maps
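
For reference, a hedged usage sketch of ViTExtractor as app.py uses it; the image path and the 420-pixel size are illustrative assumptions (420 = 30 patches x 14-pixel patch size), not values from this commit.

import torch
from PIL import Image
from model_utils.extractor_dino import ViTExtractor

device = 'cuda' if torch.cuda.is_available() else 'cpu'
extractor = ViTExtractor('dinov2_vitb14', stride=14, device=device)
# hypothetical input image, resized so it divides evenly into 14-pixel patches
img = Image.open("example.jpg").convert("RGB").resize((420, 420))
batch = extractor.preprocess_pil(img).to(device)
with torch.no_grad():
    desc = extractor.extract_descriptors(batch, layer=11, facet='token')
print(desc.shape)  # expected [1, 1, 900, 768]: 30x30 patch tokens, 768 channels each
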
model_utils/projection_network.py ADDED
@@ -0,0 +1,167 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch import nn
4
+ from model_utils.resnet import ResNet, BottleneckBlock
5
+ import torch.nn.functional as F
6
+
7
+ class DummyAggregationNetwork(nn.Module): # for testing, return the input
8
+ def __init__(self):
9
+ super(DummyAggregationNetwork, self).__init__()
10
+ # dummy parameter
11
+ self.dummy = nn.Parameter(torch.ones([]))
12
+ def forward(self, batch, pose=None):
13
+ return batch * self.dummy
14
+
15
+ class AggregationNetwork(nn.Module):
16
+ """
17
+ Module for aggregating feature maps across time and space.
18
+ Design inspired by the Feature Extractor from ODISE (Xu et al., CVPR 2023).
19
+ https://github.com/NVlabs/ODISE/blob/5836c0adfcd8d7fd1f8016ff5604d4a31dd3b145/odise/modeling/backbone/feature_extractor.py
20
+ """
21
+ def __init__(
22
+ self,
23
+ device,
24
+ feature_dims=[640,1280,1280,768],
25
+ projection_dim=384,
26
+ num_norm_groups=32,
27
+ save_timestep=[1],
28
+ kernel_size = [1,3,1],
29
+ contrastive_temp = 10,
30
+ feat_map_dropout=0.0,
31
+ num_blocks=None,
32
+ bottleneck_channels=None
33
+ ):
34
+ super().__init__()
35
+ self.skip_connection = True
36
+ self.feat_map_dropout = feat_map_dropout
37
+ self.azimuth_embedding = None
38
+ self.pos_embedding = None
39
+ self.bottleneck_layers = nn.ModuleList()
40
+ self.feature_dims = feature_dims
41
+ self.num_blocks = num_blocks if num_blocks is not None else 1
42
+ self.bottleneck_channels = bottleneck_channels if bottleneck_channels is not None else projection_dim//4
43
+ # For CLIP symmetric cross entropy loss during training
44
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
45
+ self.self_logit_scale = nn.Parameter(torch.ones([]) * np.log(contrastive_temp))
46
+ self.device = device
47
+ self.save_timestep = save_timestep
48
+
49
+ self.mixing_weights_names = []
50
+ for l, feature_dim in enumerate(self.feature_dims):
51
+ bottleneck_layer = nn.Sequential(
52
+ *ResNet.make_stage(
53
+ BottleneckBlock,
54
+ num_blocks=self.num_blocks,
55
+ in_channels=feature_dim,
56
+ bottleneck_channels=self.bottleneck_channels,
57
+ out_channels=projection_dim,
58
+ norm="GN",
59
+ num_norm_groups=num_norm_groups,
60
+ kernel_size=kernel_size
61
+ )
62
+ )
63
+ self.bottleneck_layers.append(bottleneck_layer)
64
+ for t in save_timestep:
65
+ # 1-index the layer name following prior work
66
+ self.mixing_weights_names.append(f"timestep-{save_timestep}_layer-{l+1}")
67
+ self.last_layer = None
68
+ self.bottleneck_layers = self.bottleneck_layers.to(device)
69
+ mixing_weights = torch.ones(len(self.bottleneck_layers) * len(save_timestep))
70
+ self.mixing_weights = nn.Parameter(mixing_weights.to(device))
71
+ # count number of parameters
72
+ num_params = 0
73
+ for param in self.parameters():
74
+ num_params += param.numel()
75
+ print(f"AggregationNetwork has {num_params} parameters.")
76
+
77
+ def load_pretrained_weights(self, pretrained_dict):
78
+ custom_dict = self.state_dict()
79
+
80
+ # Handle size mismatch
81
+ if 'mixing_weights' in custom_dict and 'mixing_weights' in pretrained_dict and custom_dict['mixing_weights'].shape != pretrained_dict['mixing_weights'].shape:
82
+ # Keep the first four weights from the pretrained model, and randomly initialize the fifth weight
83
+ custom_dict['mixing_weights'][:4] = pretrained_dict['mixing_weights'][:4]
84
+ custom_dict['mixing_weights'][4] = torch.zeros_like(custom_dict['mixing_weights'][4])
85
+ else:
86
+ custom_dict['mixing_weights'][:4] = pretrained_dict['mixing_weights'][:4]
87
+
88
+ # Load the weights that do match
89
+ matching_keys = {k: v for k, v in pretrained_dict.items() if k in custom_dict and k != 'mixing_weights'}
90
+ custom_dict.update(matching_keys)
91
+
92
+ # Now load the updated state_dict
93
+ self.load_state_dict(custom_dict, strict=False)
94
+
95
+ def forward(self, batch, pose=None):
96
+ """
97
+ Assumes batch is shape (B, C, H, W) where C is the concatenation of all layer features.
98
+ """
99
+ if self.feat_map_dropout > 0 and self.training:
100
+ batch = F.dropout(batch, p=self.feat_map_dropout)
101
+
102
+ output_feature = None
103
+ start = 0
104
+ mixing_weights = torch.nn.functional.softmax(self.mixing_weights, dim=0)
105
+ if self.pos_embedding is not None: #position embedding
106
+ batch = torch.cat((batch, self.pos_embedding), dim=1)
107
+ for i in range(len(mixing_weights)):
108
+ # Share bottleneck layers across timesteps
109
+ bottleneck_layer = self.bottleneck_layers[i % len(self.feature_dims)]
110
+ # Chunk the batch according to the layer
111
+ # Account for looping if there are multiple timesteps
112
+ end = start + self.feature_dims[i % len(self.feature_dims)]
113
+ feats = batch[:, start:end, :, :]
114
+ start = end
115
+ # Downsample the number of channels and weight the layer
116
+ bottlenecked_feature = bottleneck_layer(feats)
117
+ bottlenecked_feature = mixing_weights[i] * bottlenecked_feature
118
+ if output_feature is None:
119
+ output_feature = bottlenecked_feature
120
+ else:
121
+ output_feature += bottlenecked_feature
122
+
123
+ if self.last_layer is not None:
124
+
125
+ output_feature_after = self.last_layer(output_feature)
126
+ if self.skip_connection:
127
+ # skip connection
128
+ output_feature = output_feature + output_feature_after
129
+ return output_feature
130
+
131
+
132
+ def conv1x1(in_planes, out_planes, stride=1):
133
+ """1x1 convolution without padding"""
134
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=0, bias=False)
135
+
136
+
137
+ def conv3x3(in_planes, out_planes, stride=1):
138
+ """3x3 convolution with padding"""
139
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
140
+
141
+
142
+ class BasicBlock(nn.Module):
143
+ def __init__(self, in_planes, planes, stride=1):
144
+ super().__init__()
145
+ self.conv1 = conv3x3(in_planes, planes, stride)
146
+ self.conv2 = conv3x3(planes, planes)
147
+ self.bn1 = nn.BatchNorm2d(planes)
148
+ self.bn2 = nn.BatchNorm2d(planes)
149
+ self.relu = nn.ReLU(inplace=True)
150
+
151
+ if stride == 1:
152
+ self.downsample = None
153
+ else:
154
+ self.downsample = nn.Sequential(
155
+ conv1x1(in_planes, planes, stride=stride),
156
+ nn.BatchNorm2d(planes)
157
+ )
158
+
159
+ def forward(self, x):
160
+ y = x
161
+ y = self.relu(self.bn1(self.conv1(y)))
162
+ y = self.bn2(self.conv2(y))
163
+
164
+ if self.downsample is not None:
165
+ x = self.downsample(x)
166
+
167
+ return self.relu(x+y)
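
As a sanity check, a minimal sketch of how app.py drives AggregationNetwork; the random tensor is a stand-in (an assumption for illustration) for a 30x30 DINOv2 token map, and the shapes follow the configuration used in app.py.

import torch
from model_utils.projection_network import AggregationNetwork

device = 'cuda' if torch.cuda.is_available() else 'cpu'
net = AggregationNetwork(feature_dims=[768], projection_dim=768, device=device)
net.load_pretrained_weights(torch.load("ckpts/dino_spair_0300.pth", map_location=device))
feats = torch.randn(1, 768, 30, 30, device=device)  # stand-in for DINOv2 token features
with torch.no_grad():
    out = net(feats)
print(out.shape)  # torch.Size([1, 768, 30, 30]); app.py then L2-normalizes each spatial descriptor
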
model_utils/resnet.py ADDED
@@ -0,0 +1,518 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ # import fvcore.nn.weight_init as weight_init
5
+ import numpy as np
6
+
7
+ """
8
+ Functions for building the BottleneckBlock from Detectron2.
9
+ # https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/resnet.py
10
+ """
11
+
12
+ def get_norm(norm, out_channels, num_norm_groups=32):
13
+ """
14
+ Args:
15
+ norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
16
+ or a callable that takes a channel number and returns
17
+ the normalization layer as a nn.Module.
18
+ Returns:
19
+ nn.Module or None: the normalization layer
20
+ """
21
+ if norm is None:
22
+ return None
23
+ if isinstance(norm, str):
24
+ if len(norm) == 0:
25
+ return None
26
+ norm = {
27
+ "GN": lambda channels: nn.GroupNorm(num_norm_groups, channels),
28
+ }[norm]
29
+ return norm(out_channels)
30
+
31
+ def get_activation(activation):
32
+ """
33
+ Args:
34
+ activation (str or callable): either one of relu, lrelu, prelu, leaky_relu,
35
+ sigmoid, tanh, elu, selu, swish, mish; or a callable that takes a
36
+ tensor and returns a tensor.
37
+ Returns:
38
+ nn.Module or None: the activation layer
39
+ """
40
+ if activation is None:
41
+ return None
42
+ if isinstance(activation, str):
43
+ if len(activation) == 0:
44
+ return None
45
+ activation = {
46
+ "relu": nn.ReLU,
47
+ "lrelu": nn.LeakyReLU,
48
+ "prelu": nn.PReLU,
49
+ "leaky_relu": nn.LeakyReLU,
50
+ "sigmoid": nn.Sigmoid,
51
+ "tanh": nn.Tanh,
52
+ "elu": nn.ELU,
53
+ "selu": nn.SELU,
54
+ }[activation]
55
+ return activation()
56
+
57
+ # SCE crisscross + diags
58
+ class EfficientSpatialContextNet(nn.Module):
59
+ def __init__(self, kernel_size=7, in_channels=768, out_channels=768, use_cuda=True):
60
+ super(EfficientSpatialContextNet, self).__init__()
61
+ self.kernel_size = kernel_size
62
+ self.pad = kernel_size // 2
63
+ self.conv = torch.nn.Conv2d(
64
+ in_channels + 4*self.kernel_size,
65
+ out_channels,
66
+ 1,
67
+ bias=True,
68
+ padding_mode="zeros",
69
+ )
70
+
71
+ if use_cuda:
72
+ self.conv = self.conv.cuda()
73
+
74
+ def forward(self, feature):
75
+ b, c, h, w = feature.size()
76
+ feature_normalized = F.normalize(feature, p=2, dim=1)
77
+ feature_pad = F.pad(
78
+ feature_normalized, (self.pad, self.pad, self.pad, self.pad), "constant", 0
79
+ )
80
+ output = torch.zeros(
81
+ [4*self.kernel_size, b, h, w],
82
+ dtype=feature.dtype,
83
+ requires_grad=feature.requires_grad,
84
+ )
85
+ if feature.is_cuda:
86
+ output = output.cuda(feature.get_device())
87
+
88
+ # left-top to right-bottom
89
+ for i in range(self.kernel_size):
90
+ c=i
91
+ r=i
92
+ output[i] = (feature_pad[:, :, r : (h + r), c : (w + c)] * feature_normalized).sum(1)
93
+
94
+ # col
95
+ for i in range(self.kernel_size):
96
+ c=self.kernel_size//2
97
+ r=i
98
+ output[1*self.kernel_size+i] = (feature_pad[:, :, r : (h + r), c : (w + c)] * feature_normalized).sum(1)
99
+
100
+ # right-top to left-bottom
101
+ for i in range(self.kernel_size):
102
+ c=(self.kernel_size-1)-i
103
+ r=i
104
+ output[2*self.kernel_size+i] = (feature_pad[:, :, r : (h + r), c : (w + c)] * feature_normalized).sum(1)
105
+
106
+ # row
107
+ for i in range(self.kernel_size):
108
+ c=i
109
+ r=self.kernel_size//2
110
+ output[3*self.kernel_size+i] = (feature_pad[:, :, r : (h + r), c : (w + c)] * feature_normalized).sum(1)
111
+
112
+ output = output.transpose(0, 1).contiguous()
113
+ output = torch.cat((feature, output), 1)
114
+ output = self.conv(output)
115
+ # output = F.relu(output)
116
+
117
+ return output
118
+
119
+ def c2_msra_fill(module: nn.Module) -> None:
120
+ """
121
+ Initialize `module.weight` using the "MSRAFill" implemented in Caffe2.
122
+ Also initializes `module.bias` to 0.
123
+
124
+ Args:
125
+ module (torch.nn.Module): module to initialize.
126
+ """
127
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
128
+ if module.bias is not None:
129
+ nn.init.constant_(module.bias, 0)
130
+
131
+ class Conv2d(nn.Conv2d):
132
+ """
133
+ A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
134
+ """
135
+
136
+ def __init__(self, *args, **kwargs):
137
+ """
138
+ Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
139
+ Args:
140
+ norm (nn.Module, optional): a normalization layer
141
+ activation (callable(Tensor) -> Tensor): a callable activation function
142
+ It assumes that norm layer is used before activation.
143
+ """
144
+ norm = kwargs.pop("norm", None)
145
+ activation = kwargs.pop("activation", None)
146
+ super().__init__(*args, **kwargs)
147
+
148
+ self.norm = norm
149
+ self.activation = activation
150
+
151
+ def forward(self, x):
152
+ x = F.conv2d(
153
+ x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
154
+ )
155
+ if self.norm is not None:
156
+ x = self.norm(x)
157
+ if self.activation is not None:
158
+ x = self.activation(x)
159
+ return x
160
+
161
+ class CNNBlockBase(nn.Module):
162
+ """
163
+ A CNN block is assumed to have input channels, output channels and a stride.
164
+ The input and output of `forward()` method must be NCHW tensors.
165
+ The method can perform arbitrary computation but must match the given
166
+ channels and stride specification.
167
+ Attribute:
168
+ in_channels (int):
169
+ out_channels (int):
170
+ stride (int):
171
+ """
172
+
173
+ def __init__(self, in_channels, out_channels, stride):
174
+ """
175
+ The `__init__` method of any subclass should also contain these arguments.
176
+ Args:
177
+ in_channels (int):
178
+ out_channels (int):
179
+ stride (int):
180
+ """
181
+ super().__init__()
182
+ self.in_channels = in_channels
183
+ self.out_channels = out_channels
184
+ self.stride = stride
185
+
186
+ class BottleneckBlock(CNNBlockBase):
187
+ """
188
+ The standard bottleneck residual block used by ResNet-50, 101 and 152
189
+ defined in :paper:`ResNet`. It contains 3 conv layers with kernels
190
+ 1x1, 3x3, 1x1, and a projection shortcut if needed.
191
+ """
192
+
193
+ def __init__(
194
+ self,
195
+ in_channels,
196
+ out_channels,
197
+ *,
198
+ bottleneck_channels,
199
+ stride=1,
200
+ num_groups=1,
201
+ norm="GN",
202
+ stride_in_1x1=False,
203
+ dilation=1,
204
+ num_norm_groups=32,
205
+ kernel_size = (1,3,1)
206
+ ):
207
+ """
208
+ Args:
209
+ bottleneck_channels (int): number of output channels for the 3x3
210
+ "bottleneck" conv layers.
211
+ num_groups (int): number of groups for the 3x3 conv layer.
212
+ norm (str or callable): normalization for all conv layers.
213
+ See :func:`layers.get_norm` for supported format.
214
+ stride_in_1x1 (bool): when stride>1, whether to put stride in the
215
+ first 1x1 convolution or the bottleneck 3x3 convolution.
216
+ dilation (int): the dilation rate of the 3x3 conv layer.
217
+ """
218
+ super().__init__(in_channels, out_channels, stride)
219
+
220
+ if in_channels != out_channels:
221
+ self.shortcut = Conv2d(
222
+ in_channels,
223
+ out_channels,
224
+ kernel_size=1,
225
+ stride=stride,
226
+ bias=False,
227
+ norm=get_norm(norm, out_channels, num_norm_groups),
228
+ )
229
+ else:
230
+ self.shortcut = None
231
+
232
+ # The original MSRA ResNet models have stride in the first 1x1 conv
233
+ # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have
234
+ # stride in the 3x3 conv
235
+ stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)
236
+
237
+ self.conv1 = Conv2d(
238
+ in_channels,
239
+ bottleneck_channels,
240
+ kernel_size=kernel_size[0],
241
+ stride=stride_1x1,
242
+ padding=(kernel_size[0]-1)//2,
243
+ bias=False,
244
+ norm=get_norm(norm, bottleneck_channels, num_norm_groups),
245
+ )
246
+
247
+ self.conv2 = Conv2d(
248
+ bottleneck_channels,
249
+ bottleneck_channels,
250
+ kernel_size=kernel_size[1],
251
+ stride=stride_3x3,
252
+ padding=dilation*(kernel_size[1]-1)//2,
253
+ bias=False,
254
+ groups=num_groups,
255
+ dilation=dilation,
256
+ norm=get_norm(norm, bottleneck_channels, num_norm_groups),
257
+ )
258
+
259
+ self.conv3 = Conv2d(
260
+ bottleneck_channels,
261
+ out_channels,
262
+ kernel_size=kernel_size[2],
263
+ bias=False,
264
+ norm=get_norm(norm, out_channels, num_norm_groups),
265
+ )
266
+
267
+ for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]:
268
+ if layer is not None: # shortcut can be None
269
+ c2_msra_fill(layer)
270
+
271
+ # Zero-initialize the last normalization in each residual branch,
272
+ # so that at the beginning, the residual branch starts with zeros,
273
+ # and each residual block behaves like an identity.
274
+ # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
275
+ # "For BN layers, the learnable scaling coefficient γ is initialized
276
+ # to be 1, except for each residual block's last BN
277
+ # where γ is initialized to be 0."
278
+
279
+ # nn.init.constant_(self.conv3.norm.weight, 0)
280
+ # TODO this somehow hurts performance when training GN models from scratch.
281
+ # Add it as an option when we need to use this code to train a backbone.
282
+
283
+ def forward(self, x):
284
+ out = self.conv1(x)
285
+ out = F.relu_(out)
286
+
287
+ out = self.conv2(out)
288
+ out = F.relu_(out)
289
+
290
+ out = self.conv3(out)
291
+
292
+ if self.shortcut is not None:
293
+ shortcut = self.shortcut(x)
294
+ else:
295
+ shortcut = x
296
+
297
+ out += shortcut
298
+ out = F.relu_(out)
299
+ return out
300
+
301
+ class ResNet(nn.Module):
302
+ """
303
+ Implement :paper:`ResNet`.
304
+ """
305
+
306
+ def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0):
307
+ """
308
+ Args:
309
+ stem (nn.Module): a stem module
310
+ stages (list[list[CNNBlockBase]]): several (typically 4) stages,
311
+ each contains multiple :class:`CNNBlockBase`.
312
+ num_classes (None or int): if None, will not perform classification.
313
+ Otherwise, will create a linear layer.
314
+ out_features (list[str]): name of the layers whose outputs should
315
+ be returned in forward. Can be anything in "stem", "linear", or "res2" ...
316
+ If None, will return the output of the last layer.
317
+ freeze_at (int): The number of stages at the beginning to freeze.
318
+ see :meth:`freeze` for detailed explanation.
319
+ """
320
+ super().__init__()
321
+ self.stem = stem
322
+ self.num_classes = num_classes
323
+
324
+ current_stride = self.stem.stride
325
+ self._out_feature_strides = {"stem": current_stride}
326
+ self._out_feature_channels = {"stem": self.stem.out_channels}
327
+
328
+ self.stage_names, self.stages = [], []
329
+
330
+ if out_features is not None:
331
+ # Avoid keeping unused layers in this module. They consume extra memory
332
+ # and may cause allreduce to fail
333
+ num_stages = max(
334
+ [{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]
335
+ )
336
+ stages = stages[:num_stages]
337
+ for i, blocks in enumerate(stages):
338
+ assert len(blocks) > 0, len(blocks)
339
+ for block in blocks:
340
+ assert isinstance(block, CNNBlockBase), block
341
+
342
+ name = "res" + str(i + 2)
343
+ stage = nn.Sequential(*blocks)
344
+
345
+ self.add_module(name, stage)
346
+ self.stage_names.append(name)
347
+ self.stages.append(stage)
348
+
349
+ self._out_feature_strides[name] = current_stride = int(
350
+ current_stride * np.prod([k.stride for k in blocks])
351
+ )
352
+ self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels
353
+ self.stage_names = tuple(self.stage_names) # Make it static for scripting
354
+
355
+ if num_classes is not None:
356
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
357
+ self.linear = nn.Linear(curr_channels, num_classes)
358
+
359
+ # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour":
360
+ # "The 1000-way fully-connected layer is initialized by
361
+ # drawing weights from a zero-mean Gaussian with standard deviation of 0.01."
362
+ nn.init.normal_(self.linear.weight, std=0.01)
363
+ name = "linear"
364
+
365
+ if out_features is None:
366
+ out_features = [name]
367
+ self._out_features = out_features
368
+ assert len(self._out_features)
369
+ children = [x[0] for x in self.named_children()]
370
+ for out_feature in self._out_features:
371
+ assert out_feature in children, "Available children: {}".format(", ".join(children))
372
+ self.freeze(freeze_at)
373
+
374
+ def forward(self, x):
375
+ """
376
+ Args:
377
+ x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
378
+ Returns:
379
+ dict[str->Tensor]: names and the corresponding features
380
+ """
381
+ assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!"
382
+ outputs = {}
383
+ x = self.stem(x)
384
+ if "stem" in self._out_features:
385
+ outputs["stem"] = x
386
+ for name, stage in zip(self.stage_names, self.stages):
387
+ x = stage(x)
388
+ if name in self._out_features:
389
+ outputs[name] = x
390
+ if self.num_classes is not None:
391
+ x = self.avgpool(x)
392
+ x = torch.flatten(x, 1)
393
+ x = self.linear(x)
394
+ if "linear" in self._out_features:
395
+ outputs["linear"] = x
396
+ return outputs
397
+
398
+ def freeze(self, freeze_at=0):
399
+ """
400
+ Freeze the first several stages of the ResNet. Commonly used in
401
+ fine-tuning.
402
+ Layers that produce the same feature map spatial size are defined as one
403
+ "stage" by :paper:`FPN`.
404
+ Args:
405
+ freeze_at (int): number of stages to freeze.
406
+ `1` means freezing the stem. `2` means freezing the stem and
407
+ one residual stage, etc.
408
+ Returns:
409
+ nn.Module: this ResNet itself
410
+ """
411
+ if freeze_at >= 1:
412
+ self.stem.freeze()
413
+ for idx, stage in enumerate(self.stages, start=2):
414
+ if freeze_at >= idx:
415
+ for block in stage.children():
416
+ block.freeze()
417
+ return self
418
+
419
+ @staticmethod
420
+ def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs):
421
+ """
422
+ Create a list of blocks of the same type that forms one ResNet stage.
423
+ Args:
424
+ block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this
425
+ stage. A module of this type must not change spatial resolution of inputs unless its
426
+ stride != 1.
427
+ num_blocks (int): number of blocks in this stage
428
+ in_channels (int): input channels of the entire stage.
429
+ out_channels (int): output channels of **every block** in the stage.
430
+ kwargs: other arguments passed to the constructor of
431
+ `block_class`. If the argument name is "xx_per_block", the
432
+ argument is a list of values to be passed to each block in the
433
+ stage. Otherwise, the same argument is passed to every block
434
+ in the stage.
435
+ Returns:
436
+ list[CNNBlockBase]: a list of block module.
437
+ Examples:
438
+ ::
439
+ stage = ResNet.make_stage(
440
+ BottleneckBlock, 3, in_channels=16, out_channels=64,
441
+ bottleneck_channels=16, num_groups=1,
442
+ stride_per_block=[2, 1, 1],
443
+ dilations_per_block=[1, 1, 2]
444
+ )
445
+ Usually, layers that produce the same feature map spatial size are defined as one
446
+ "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should
447
+ all be 1.
448
+ """
449
+ blocks = []
450
+ for i in range(num_blocks):
451
+ curr_kwargs = {}
452
+ for k, v in kwargs.items():
453
+ if k.endswith("_per_block"):
454
+ assert len(v) == num_blocks, (
455
+ f"Argument '{k}' of make_stage should have the "
456
+ f"same length as num_blocks={num_blocks}."
457
+ )
458
+ newk = k[: -len("_per_block")]
459
+ assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!"
460
+ curr_kwargs[newk] = v[i]
461
+ else:
462
+ curr_kwargs[k] = v
463
+
464
+ blocks.append(
465
+ block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)
466
+ )
467
+ in_channels = out_channels
468
+ return blocks
469
+
470
+ @staticmethod
471
+ def make_default_stages(depth, block_class=None, **kwargs):
472
+ """
473
+ Create a list of ResNet stages from a pre-defined depth (one of 18, 34, 50, 101, 152).
474
+ If it doesn't create the ResNet variant you need, please use :meth:`make_stage`
475
+ instead for fine-grained customization.
476
+ Args:
477
+ depth (int): depth of ResNet
478
+ block_class (type): the CNN block class. Has to accept
479
+ `bottleneck_channels` argument for depth > 50.
480
+ By default it is BasicBlock or BottleneckBlock, based on the
481
+ depth.
482
+ kwargs:
483
+ other arguments to pass to `make_stage`. Should not contain
484
+ stride and channels, as they are predefined for each depth.
485
+ Returns:
486
+ list[list[CNNBlockBase]]: modules in all stages; see arguments of
487
+ :class:`ResNet.__init__`.
488
+ """
489
+ num_blocks_per_stage = {
490
+ 18: [2, 2, 2, 2],
491
+ 34: [3, 4, 6, 3],
492
+ 50: [3, 4, 6, 3],
493
+ 101: [3, 4, 23, 3],
494
+ 152: [3, 8, 36, 3],
495
+ }[depth]
496
+ if block_class is None:
497
+ block_class = BasicBlock if depth < 50 else BottleneckBlock
498
+ if depth < 50:
499
+ in_channels = [64, 64, 128, 256]
500
+ out_channels = [64, 128, 256, 512]
501
+ else:
502
+ in_channels = [64, 256, 512, 1024]
503
+ out_channels = [256, 512, 1024, 2048]
504
+ ret = []
505
+ for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels):
506
+ if depth >= 50:
507
+ kwargs["bottleneck_channels"] = o // 4
508
+ ret.append(
509
+ ResNet.make_stage(
510
+ block_class=block_class,
511
+ num_blocks=n,
512
+ stride_per_block=[s] + [1] * (n - 1),
513
+ in_channels=i,
514
+ out_channels=o,
515
+ **kwargs,
516
+ )
517
+ )
518
+ return ret
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ numpy
2
+ torch
3
+ torchvision
4
+ pillow
5
+ gradio
6
+ matplotlib
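
To run the demo outside of the Space, the usual workflow would be to clone the repository with Git LFS enabled (so that ckpts/dino_spair_0300.pth is actually downloaded rather than left as a pointer file), install the packages listed above with pip, and run app.py; exact commands depend on the local setup.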