Commit a7dedf9 · 1 Parent(s): 2a347d3
Yiming-M committed on 2025-07-31 18:59 🐣
.gitignore ADDED
@@ -0,0 +1,163 @@
1
+ # MacOS
2
+ **/.DS_Store
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+ # C extensions
10
+ *.so
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ build/
15
+ develop-eggs/
16
+ dist/
17
+ downloads/
18
+ eggs/
19
+ .eggs/
20
+ lib/
21
+ lib64/
22
+ parts/
23
+ sdist/
24
+ var/
25
+ wheels/
26
+ share/python-wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ MANIFEST
31
+
32
+ # PyInstaller
33
+ # Usually these files are written by a python script from a template
34
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Installer logs
39
+ pip-log.txt
40
+ pip-delete-this-directory.txt
41
+
42
+ # Unit test / coverage reports
43
+ htmlcov/
44
+ .tox/
45
+ .nox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ *.py,cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+ cover/
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # poetry
101
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
105
+ #poetry.lock
106
+
107
+ # pdm
108
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
109
+ #pdm.lock
110
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
111
+ # in version control.
112
+ # https://pdm.fming.dev/#use-with-ide
113
+ .pdm.toml
114
+
115
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116
+ __pypackages__/
117
+
118
+ # Celery stuff
119
+ celerybeat-schedule
120
+ celerybeat.pid
121
+
122
+ # SageMath parsed files
123
+ *.sage.py
124
+
125
+ # Environments
126
+ .env
127
+ .venv
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ #.idea/
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: ZIP B
3
- emoji: 🦀
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.39.0
8
  app_file: app.py
9
- pinned: false
10
  license: mit
11
  short_description: The crowd counting model ZIP-B
12
  ---
 
1
  ---
2
+ title: ZIP
3
+ emoji: 🔢
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: gradio
7
  sdk_version: 5.39.0
8
  app_file: app.py
9
+ pinned: true
10
  license: mit
11
  short_description: The crowd counting model ZIP-B
12
  ---
app.py ADDED
@@ -0,0 +1,459 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+ import torchvision.transforms.functional as TF
5
+
6
+ from torch import Tensor
7
+ import spaces
8
+
9
+ import numpy as np
10
+ from PIL import Image
11
+ import gradio as gr
12
+ from matplotlib import cm
13
+ from huggingface_hub import hf_hub_download
14
+ from warnings import warn
15
+
16
+ from models import get_model
17
+
18
+
19
+ mean = (0.485, 0.456, 0.406)
20
+ std = (0.229, 0.224, 0.225)
21
+ alpha = 0.8
22
+ EPS = 1e-8
23
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ loaded_model = None  # set by load_model(); checked in predict() before loading a checkpoint
24
+
25
+
26
+ pretrained_datasets = {
27
+ "ZIP-B": ["ShanghaiTech A", "ShanghaiTech B", "UCF-QNRF", "NWPU-Crowd"],
28
+ "ZIP-S": ["ShanghaiTech A", "ShanghaiTech B", "UCF-QNRF"],
29
+ "ZIP-T": ["ShanghaiTech A", "ShanghaiTech B", "UCF-QNRF"],
30
+ "ZIP-N": ["ShanghaiTech A", "ShanghaiTech B", "UCF-QNRF"],
31
+ "ZIP-P": ["ShanghaiTech A", "ShanghaiTech B", "UCF-QNRF"],
32
+ }
33
+
34
+ # -----------------------------
35
+ # Define the model architecture
36
+ # -----------------------------
37
+ def load_model(variant: str, dataset: str = "ShanghaiTech B", metric: str = "mae"):
38
+ """ Load the model weights from the Hugging Face Hub."""
39
+ global loaded_model
40
+ # Build model
41
+
42
+ model_info_path = hf_hub_download(
43
+ repo_id=f"Yiming-M/{variant}",
44
+ filename=f"checkpoints/{dataset}/best_{metric}.pth",
45
+ )
46
+
47
+ model = get_model(model_info_path=model_info_path)
48
+ model.eval()
49
+ loaded_model = model
50
+
51
+
52
+ def _calc_size(
53
+ img_w: int,
54
+ img_h: int,
55
+ min_size: int,
56
+ max_size: int,
57
+ base: int = 32
58
+ ):
59
+ """
60
+ This function generates a new size for an image while keeping the aspect ratio. The new size should be within the given range (min_size, max_size).
61
+
62
+ Args:
63
+ img_w (int): The width of the image.
64
+ img_h (int): The height of the image.
65
+ min_size (int): The minimum size of the edges of the image.
66
+ max_size (int): The maximum size of the edges of the image.
67
+ base (int): The value that the new edge lengths should be a multiple of.
68
+ """
69
+ assert min_size % base == 0, f"min_size ({min_size}) must be a multiple of {base}"
70
+ if max_size != float("inf"):
71
+ assert max_size % base == 0, f"max_size ({max_size}) must be a multiple of {base} if provided"
72
+
73
+ assert min_size <= max_size, f"min_size ({min_size}) must be less than or equal to max_size ({max_size})"
74
+
75
+ aspect_ratios = (img_w / img_h, img_h / img_w)
76
+ if min_size / max_size <= min(aspect_ratios) <= max(aspect_ratios) <= max_size / min_size: # possible to resize and preserve the aspect ratio
77
+ if min_size <= min(img_w, img_h) <= max(img_w, img_h) <= max_size: # already within the range, no need to resize
78
+ ratio = 1.
79
+ elif min(img_w, img_h) < min_size: # smaller than the minimum size, resize to the minimum size
80
+ ratio = min_size / min(img_w, img_h)
81
+ else: # larger than the maximum size, resize to the maximum size
82
+ ratio = max_size / max(img_w, img_h)
83
+
84
+ new_w, new_h = int(round(img_w * ratio / base) * base), int(round(img_h * ratio / base) * base)
85
+ new_w = max(min_size, min(max_size, new_w))
86
+ new_h = max(min_size, min(max_size, new_h))
87
+ return new_w, new_h
88
+
89
+ else: # impossible to resize and preserve the aspect ratio
90
+ msg = f"Impossible to resize {img_w}x{img_h} image while preserving the aspect ratio to a size within the range ({min_size}, {max_size}). Will not limit the maximum size."
91
+ warn(msg)
92
+ return _calc_size(img_w, img_h, min_size, float("inf"), base)
93
+
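A quick illustration of _calc_size (added here for clarity; the input values are hypothetical and not part of the commit):

    # Both edges are rounded to multiples of `base` (32) while staying within
    # [min_size, max_size] and roughly preserving the aspect ratio.
    new_w, new_h = _calc_size(img_w=1920, img_h=1080, min_size=448, max_size=2048, base=32)
    print(new_w, new_h)  # -> 1920 1088 (already in range, edges snapped to multiples of 32)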
94
+
95
+ # -----------------------------
96
+ # Preprocessing function
97
+ # -----------------------------
98
+ # Adjust the image transforms to match what your model expects.
99
+ def transform(image: Image.Image, dataset_name: str) -> Tensor:
100
+ assert isinstance(image, Image.Image), "Input must be a PIL Image"
101
+ image_tensor = TF.to_tensor(image)
102
+
103
+ if dataset_name == "sha":
104
+ min_size = 448
105
+ max_size = float("inf")
106
+ elif dataset_name == "shb":
107
+ min_size = 448
108
+ max_size = float("inf")
109
+ elif dataset_name == "qnrf":
110
+ min_size = 448
111
+ max_size = 2048
112
+ elif dataset_name == "nwpu":
113
+ min_size = 448
114
+ max_size = 3072
115
+
116
+ image_height, image_width = image_tensor.shape[-2:]
117
+ new_width, new_height = _calc_size(
118
+ img_w=image_width,
119
+ img_h=image_height,
120
+ min_size=min_size,
121
+ max_size=max_size,
122
+ base=32
123
+ )
124
+ if new_height != image_height or new_width != image_width:
125
+ image_tensor = TF.resize(image_tensor, size=(new_height, new_width), interpolation=TF.InterpolationMode.BICUBIC, antialias=True)  # Lanczos is PIL-only; tensor inputs support bilinear/bicubic
126
+
127
+ image_tensor = TF.normalize(image_tensor, mean=mean, std=std)
128
+ return image_tensor.unsqueeze(0) # Add batch dimension
129
+
130
+
131
+ def _sliding_window_predict(
132
+ model: nn.Module,
133
+ image: Tensor,
134
+ window_size: int,
135
+ stride: int,
136
+ max_num_windows: int = 256
137
+ ):
138
+ assert len(image.shape) == 4, f"Image must be a 4D tensor (1, c, h, w), got {image.shape}"
139
+ window_size = (int(window_size), int(window_size)) if isinstance(window_size, (int, float)) else window_size
140
+ stride = (int(stride), int(stride)) if isinstance(stride, (int, float)) else stride
141
+ window_size = tuple(window_size)
142
+ stride = tuple(stride)
143
+ assert isinstance(window_size, tuple) and len(window_size) == 2 and window_size[0] > 0 and window_size[1] > 0, f"Window size must be a positive integer tuple (h, w), got {window_size}"
144
+ assert isinstance(stride, tuple) and len(stride) == 2 and stride[0] > 0 and stride[1] > 0, f"Stride must be a positive integer tuple (h, w), got {stride}"
145
+ assert stride[0] <= window_size[0] and stride[1] <= window_size[1], f"Stride must be smaller than window size, got {stride} and {window_size}"
146
+
147
+ image_height, image_width = image.shape[-2:]
148
+ window_height, window_width = window_size
149
+ assert image_height >= window_height and image_width >= window_width, f"Image size must be larger than window size, got image size {image.shape} and window size {window_size}"
150
+ stride_height, stride_width = stride
151
+
152
+ num_rows = int(np.ceil((image_height - window_height) / stride_height) + 1)
153
+ num_cols = int(np.ceil((image_width - window_width) / stride_width) + 1)
154
+
155
+ if hasattr(model, "block_size"):
156
+ block_size = model.block_size
157
+ elif hasattr(model, "module") and hasattr(model.module, "block_size"):
158
+ block_size = model.module.block_size
159
+ else:
160
+ raise ValueError("Model must have block_size attribute")
161
+ assert window_height % block_size == 0 and window_width % block_size == 0, f"Window size must be divisible by block size, got {window_size} and {block_size}"
162
+
163
+ windows = []
164
+ for i in range(num_rows):
165
+ for j in range(num_cols):
166
+ x_start, y_start = i * stride_height, j * stride_width
167
+ x_end, y_end = x_start + window_height, y_start + window_width
168
+ if x_end > image_height:
169
+ x_start, x_end = image_height - window_height, image_height
170
+ if y_end > image_width:
171
+ y_start, y_end = image_width - window_width, image_width
172
+
173
+ window = image[:, :, x_start:x_end, y_start:y_end]
174
+ windows.append(window)
175
+
176
+ windows = torch.cat(windows, dim=0).to(image.device) # batched windows, shape: (num_windows, c, h, w)
177
+
178
+ model.eval()
179
+ pi_maps, lambda_maps = [], []
180
+ for i in range(0, len(windows), max_num_windows):
181
+ with torch.no_grad():
182
+ image_feats = model.backbone(windows[i: min(i + max_num_windows, len(windows))])
183
+ pi_image_feats, lambda_image_feats = model.pi_head(image_feats), model.lambda_head(image_feats)
184
+ pi_image_feats = F.normalize(pi_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
185
+ lambda_image_feats = F.normalize(lambda_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
186
+
187
+ pi_text_feats, lambda_text_feats = model.pi_text_feats, model.lambda_text_feats
188
+ pi_logit_scale, lambda_logit_scale = model.pi_logit_scale.exp(), model.lambda_logit_scale.exp()
189
+
190
+ pi_logit_map = pi_logit_scale * pi_image_feats @ pi_text_feats.t() # (B, H, W, 2), logits per image
191
+ lambda_logit_map = lambda_logit_scale * lambda_image_feats @ lambda_text_feats.t() # (B, H, W, N - 1), logits per image
192
+
193
+ pi_logit_map = pi_logit_map.permute(0, 3, 1, 2) # (B, 2, H, W)
194
+ lambda_logit_map = lambda_logit_map.permute(0, 3, 1, 2) # (B, N - 1, H, W)
195
+
196
+ lambda_map = (lambda_logit_map.softmax(dim=1) * model.bin_centers[:, 1:]).sum(dim=1, keepdim=True) # (B, 1, H, W)
197
+ pi_map = pi_logit_map.softmax(dim=1)[:, 0:1] # (B, 1, H, W)
198
+
199
+ pi_maps.append(pi_map.cpu().numpy())
200
+ lambda_maps.append(lambda_map.cpu().numpy())
201
+
202
+ # assemble the density map
203
+ pi_maps = np.concatenate(pi_maps, axis=0) # shape: (num_windows, 1, H, W)
204
+ lambda_maps = np.concatenate(lambda_maps, axis=0) # shape: (num_windows, 1, H, W)
205
+ assert pi_maps.shape == lambda_maps.shape, f"pi_maps and lambda_maps must have the same shape, got {pi_maps.shape} and {lambda_maps.shape}"
206
+
207
+ pi_map = np.zeros((pi_maps.shape[1], image_height // block_size, image_width // block_size), dtype=np.float32)
208
+ lambda_map = np.zeros((lambda_maps.shape[1], image_height // block_size, image_width // block_size), dtype=np.float32)
209
+ count_map = np.zeros((pi_maps.shape[1], image_height // block_size, image_width // block_size), dtype=np.float32)
210
+ idx = 0
211
+ for i in range(num_rows):
212
+ for j in range(num_cols):
213
+ x_start, y_start = i * stride_height, j * stride_width
214
+ x_end, y_end = x_start + window_height, y_start + window_width
215
+ if x_end > image_height:
216
+ x_start, x_end = image_height - window_height, image_height
217
+ if y_end > image_width:
218
+ y_start, y_end = image_width - window_width, image_width
219
+
220
+ pi_map[:, (x_start // block_size): (x_end // block_size), (y_start // block_size): (y_end // block_size)] += pi_maps[idx, :, :, :]
221
+ lambda_map[:, (x_start // block_size): (x_end // block_size), (y_start // block_size): (y_end // block_size)] += lambda_maps[idx, :, :, :]
222
+ count_map[:, (x_start // block_size): (x_end // block_size), (y_start // block_size): (y_end // block_size)] += 1.
223
+ idx += 1
224
+
225
+ # average the density map
226
+ pi_map /= count_map
227
+ lambda_map /= count_map
228
+
229
+ # convert to Tensor and reshape
230
+ pi_map = torch.from_numpy(pi_map).unsqueeze(0) # shape: (1, 1, H // block_size, W // block_size)
231
+ lambda_map = torch.from_numpy(lambda_map).unsqueeze(0) # shape: (1, 1, H // block_size, W // block_size)
232
+ return pi_map, lambda_map
233
+
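Note on the sliding-window assembly above (explanatory, not part of the commit): overlapping windows are summed into pi_map / lambda_map while count_map records how many windows cover each block, so the division yields a per-block mean. A sanity check one could add:

    # Every block must be covered by at least one window (guaranteed here because
    # stride <= window size and out-of-range windows are clamped to the image border),
    # otherwise the division below would produce NaNs.
    assert (count_map > 0).all()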
234
+
235
+ # -----------------------------
236
+ # Inference function
237
+ # -----------------------------
238
+ @spaces.GPU(duration=120)
239
+ def predict(image: Image.Image, variant: str, dataset: str, metric: str):
240
+ """
241
+ Given an input image, preprocess it, run the model to obtain a density map,
242
+ compute the total crowd count, and prepare the density map for display.
243
+ """
244
+ global loaded_model
245
+
246
+ if loaded_model is None:
247
+
248
+ if dataset == "ShanghaiTech A":
249
+ dataset_name = "sha"
250
+ elif dataset == "ShanghaiTech B":
251
+ dataset_name = "shb"
252
+ elif dataset == "UCF-QNRF":
253
+ dataset_name = "qnrf"
254
+ elif dataset == "NWPU-Crowd":
255
+ dataset_name = "nwpu"
256
+
257
+ # load_model() resolves the Hub repo and checkpoint path from (variant, dataset, metric)
258
+ load_model(variant, dataset_name, metric)
259
+
260
+ loaded_model.to(device)
261
+
262
+ # Preprocess the image
263
+ input_width, input_height = image.size
264
+ image_tensor = transform(image, dataset_name).to(device) # shape: (1, 3, H, W)
265
+
266
+ input_size = loaded_model.input_size
267
+ image_height, image_width = image_tensor.shape[-2:]
268
+ aspect_ratio = image_width / image_height
269
+ if image_height < input_size:
270
+ new_height = input_size
271
+ new_width = int(new_height * aspect_ratio)
272
+ image_tensor = F.interpolate(image_tensor, size=(new_height, new_width), mode="bicubic", align_corners=False, antialias=True)
273
+ image_height, image_width = new_height, new_width
274
+ if image_width < input_size:
275
+ new_width = input_size
276
+ new_height = int(new_width / aspect_ratio)
277
+ image_tensor = F.interpolate(image_tensor, size=(new_height, new_width), mode="bicubic", align_corners=False, antialias=True)
278
+ image_height, image_width = new_height, new_width
279
+
280
+ with torch.no_grad():
281
+ if hasattr(loaded_model, "num_vpt") and loaded_model.num_vpt > 0: # For ViT models, use sliding window prediction
282
+ # For ViT models with VPT
283
+ pi_map, lambda_map = _sliding_window_predict(
284
+ model=loaded_model,
285
+ image=image_tensor,
286
+ window_size=input_size,
287
+ stride=input_size
288
+ )
289
+
290
+ elif hasattr(loaded_model, "pi_text_feats") and hasattr(loaded_model, "lambda_text_feats") and loaded_model.pi_text_feats is not None and loaded_model.lambda_text_feats is not None: # For other CLIP-based models
291
+ image_feats = loaded_model.backbone(image_tensor)
292
+ # image_feats = F.normalize(image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
293
+ pi_image_feats, lambda_image_feats = loaded_model.pi_head(image_feats), loaded_model.lambda_head(image_feats)
294
+ pi_image_feats = F.normalize(pi_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
295
+ lambda_image_feats = F.normalize(lambda_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
296
+
297
+ pi_text_feats, lambda_text_feats = loaded_model.pi_text_feats, loaded_model.lambda_text_feats
298
+ pi_logit_scale, lambda_logit_scale = loaded_model.pi_logit_scale.exp(), loaded_model.lambda_logit_scale.exp()
299
+
300
+ pi_logit_map = pi_logit_scale * pi_image_feats @ pi_text_feats.t() # (B, H, W, 2), logits per image
301
+ lambda_logit_map = lambda_logit_scale * lambda_image_feats @ lambda_text_feats.t() # (B, H, W, N - 1), logits per image
302
+
303
+ pi_logit_map = pi_logit_map.permute(0, 3, 1, 2) # (B, 2, H, W)
304
+ lambda_logit_map = lambda_logit_map.permute(0, 3, 1, 2) # (B, N - 1, H, W)
305
+
306
+ lambda_map = (lambda_logit_map.softmax(dim=1) * loaded_model.bin_centers[:, 1:]).sum(dim=1, keepdim=True) # (B, 1, H, W)
307
+ pi_map = pi_logit_map.softmax(dim=1)[:, 0:1] # (B, 1, H, W)
308
+
309
+ else: # For non-CLIP models
310
+ x = loaded_model.backbone(image_tensor)
311
+ logit_pi_map = loaded_model.pi_head(x) # shape: (B, 2, H, W)
312
+ logit_map = loaded_model.bin_head(x) # shape: (B, C, H, W)
313
+ lambda_map= (logit_map.softmax(dim=1) * loaded_model.bin_centers[:, 1:]).sum(dim=1, keepdim=True) # shape: (B, 1, H, W)
314
+ pi_map = logit_pi_map.softmax(dim=1)[:, 0:1] # shape: (B, 1, H, W)
315
+
316
+
317
+ den_map = (1.0 - pi_map) * lambda_map # shape: (B, 1, H, W)
318
+ count = den_map.sum().item()
319
+
320
+ structural_zero_map = F.interpolate(
321
+ pi_map, size=(input_height, input_width), mode="bilinear", align_corners=False, antialias=True
322
+ ).cpu().squeeze().numpy()
323
+
324
+ lambda_map = F.interpolate(
325
+ lambda_map, size=(input_height, input_width), mode="bilinear", align_corners=False, antialias=True
326
+ ).cpu().squeeze().numpy()
327
+
328
+ den_map = F.interpolate(
329
+ den_map, size=(input_height, input_width), mode="bilinear", align_corners=False, antialias=True
330
+ ).cpu().squeeze().numpy()
331
+
332
+ sampling_zero_map = (1.0 - structural_zero_map) * np.exp(-lambda_map)
333
+ complete_zero_map = structural_zero_map + sampling_zero_map
334
+
335
+ # Normalize maps for display purposes
336
+ def normalize_map(x: np.ndarray) -> np.ndarray:
337
+ """ Normalize the map to [0, 1] range for visualization. """
338
+ x_min = np.min(x)
339
+ x_max = np.max(x)
340
+ if x_max - x_min < EPS:
341
+ return np.zeros_like(x)
342
+ return (x - x_min) / (x_max - x_min + EPS)
343
+
344
+ structural_zero_map = normalize_map(structural_zero_map)
345
+ sampling_zero_map = normalize_map(sampling_zero_map)
346
+ lambda_map = normalize_map(lambda_map)
347
+ den_map = normalize_map(den_map)
348
+ complete_zero_map = normalize_map(complete_zero_map)
349
+
350
+ # Apply a colormap (e.g., 'jet') to get an RGBA image
351
+ colormap = cm.get_cmap("jet")
352
+
353
+ # The colormap returns values in [0,1]. Scale to [0,255] and convert to uint8.
354
+ den_map = (colormap(den_map) * 255).astype(np.uint8)
355
+ structural_zero_map = (colormap(structural_zero_map) * 255).astype(np.uint8)
356
+ sampling_zero_map = (colormap(sampling_zero_map) * 255).astype(np.uint8)
357
+ lambda_map = (colormap(lambda_map) * 255).astype(np.uint8)
358
+ complete_zero_map = (colormap(complete_zero_map) * 255).astype(np.uint8)
359
+
360
+ # Convert to PIL images
361
+ den_map = Image.fromarray(den_map).convert("RGBA")
362
+ structural_zero_map = Image.fromarray(structural_zero_map).convert("RGBA")
363
+ sampling_zero_map = Image.fromarray(sampling_zero_map).convert("RGBA")
364
+ lambda_map = Image.fromarray(lambda_map).convert("RGBA")
365
+ complete_zero_map = Image.fromarray(complete_zero_map).convert("RGBA")
366
+
367
+ # Ensure the original image is in RGBA format.
368
+ image_rgba = image.convert("RGBA")
369
+
370
+ den_map = Image.blend(image_rgba, den_map, alpha=alpha)
371
+ structural_zero_map = Image.blend(image_rgba, structural_zero_map, alpha=alpha)
372
+ sampling_zero_map = Image.blend(image_rgba, sampling_zero_map, alpha=alpha)
373
+ lambda_map = Image.blend(image_rgba, lambda_map, alpha=alpha)
374
+ complete_zero_map = Image.blend(image_rgba, complete_zero_map, alpha=alpha)
375
+
376
+ return image, structural_zero_map, sampling_zero_map, complete_zero_map, lambda_map, den_map, f"Predicted Count: {count:.2f}"
377
+
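Background on the maps returned above (explanatory note, not part of the commit): under a zero-inflated Poisson model, each block is empty with structural probability pi and otherwise draws a Poisson(lambda) count, so:

    # Per block, for scalar pi and lam (hypothetical names used only for this note):
    expected_count   = (1 - pi) * lam                  # what den_map accumulates before summing
    p_block_is_empty = pi + (1 - pi) * np.exp(-lam)    # structural + sampling zeros, i.e. complete_zero_map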
378
+
379
+ # -----------------------------
380
+ # Build Gradio Interface using Blocks for a two-column layout
381
+ # -----------------------------
382
+ with gr.Blocks() as demo:
383
+ gr.Markdown("# Crowd Counting by ZIP")
384
+ gr.Markdown("Upload an image or select an example below to see the predicted crowd density map and total count.")
385
+
386
+ with gr.Row():
387
+ with gr.Column():
388
+ # Dropdown for model variant
389
+ variant_dropdown = gr.Dropdown(
390
+ choices=list(pretrained_datasets.keys()),
391
+ value="ZIP-B",
392
+ label="Select Model Variant"
393
+ )
394
+
395
+ # Dropdown for pretrained dataset, dynamically updated based on variant
396
+ dataset_dropdown = gr.Dropdown(
397
+ choices=pretrained_datasets["ZIP-B"],
398
+ value=pretrained_datasets["ZIP-B"][0],
399
+ label="Select Pretrained Dataset"
400
+ )
401
+
402
+ # Dropdown for metric, always the same choices
403
+ metric_dropdown = gr.Dropdown(
404
+ choices=["mae", "rmse", "nae"],
405
+ value="mae",
406
+ label="Select Best Metric"
407
+ )
408
+
409
+ # Update dataset choices when variant changes
410
+ def update_dataset(variant):
411
+ choices = pretrained_datasets[variant]
412
+ return gr.Dropdown(
413
+ choices=choices,
414
+ value=choices[0]
415
+ )
416
+
417
+ variant_dropdown.change(
418
+ fn=update_dataset,
419
+ inputs=variant_dropdown,
420
+ outputs=dataset_dropdown
421
+ )
422
+ input_img = gr.Image(label="Input Image", sources=["upload", "clipboard"], type="pil")
423
+ submit_btn = gr.Button("Predict")
424
+
425
+ with gr.Column():
426
+ output_den_map = gr.Image(label="Predicted Density Map", type="pil")
427
+ output_structural_zero_map = gr.Image(label="Structural Zero Map", type="pil")
428
+ output_sampling_zero_map = gr.Image(label="Sampling Zero Map", type="pil")
429
+ output_lambda_map = gr.Image(label="Lambda Map", type="pil")
430
+ output_complete_zero_map = gr.Image(label="Complete Zero Map", type="pil")
431
+
432
+ output_text = gr.Textbox(label="Total Count")
433
+
434
+ submit_btn.click(
435
+ fn=predict,
436
+ inputs=[input_img, variant_dropdown, dataset_dropdown, metric_dropdown],
437
+ outputs=[input_img, output_structural_zero_map, output_sampling_zero_map, output_complete_zero_map, output_lambda_map, output_den_map, output_text]
438
+ )
439
+
440
+ gr.Examples(
441
+ examples=[
442
+ ["example1.jpg"],
443
+ ["example2.jpg"],
444
+ ["example3.jpg"],
445
+ ["example4.jpg"],
446
+ ["example5.jpg"],
447
+ ["example6.jpg"],
448
+ ["example7.jpg"],
449
+ ["example8.jpg"],
450
+ ["example9.jpg"],
451
+ ["example10.jpg"],
452
+ ["example11.jpg"],
453
+ ["example12.jpg"]
454
+ ],
455
+ inputs=input_img,
456
+ label="Try an example"
457
+ )
458
+
459
+ demo.launch()
models/__init__.py ADDED
@@ -0,0 +1,155 @@
1
+ import os, torch
2
+ from typing import List, Tuple, Optional, Union, Dict
3
+
4
+ from .ebc import _ebc, EBC
5
+ from .clip_ebc import _clip_ebc, CLIP_EBC
6
+
7
+
8
+ def get_model(
9
+ model_info_path: str,
10
+ model_name: Optional[str] = None,
11
+ block_size: Optional[int] = None,
12
+ bins: Optional[List[Tuple[float, float]]] = None,
13
+ bin_centers: Optional[List[float]] = None,
14
+ zero_inflated: Optional[bool] = True,
15
+ # parameters for CLIP_EBC
16
+ clip_weight_name: Optional[str] = None,
17
+ num_vpt: Optional[int] = None,
18
+ vpt_drop: Optional[float] = None,
19
+ input_size: Optional[int] = None,
20
+ adapter: bool = False,
21
+ adapter_reduction: Optional[int] = None,
22
+ lora: bool = False,
23
+ lora_rank: Optional[int] = None,
24
+ lora_alpha: Optional[int] = None,
25
+ lora_dropout: Optional[float] = None,
26
+ norm: str = "none",
27
+ act: str = "none",
28
+ text_prompts: Optional[List[str]] = None
29
+ ) -> Union[EBC, CLIP_EBC]:
30
+ if os.path.exists(model_info_path):
31
+ model_info = torch.load(model_info_path, map_location="cpu", weights_only=False)
32
+
33
+ model_name = model_info["config"]["model_name"]
34
+ block_size = model_info["config"]["block_size"]
35
+ bins = model_info["config"]["bins"]
36
+ bin_centers = model_info["config"]["bin_centers"]
37
+ zero_inflated = model_info["config"]["zero_inflated"]
38
+
39
+ clip_weight_name = model_info["config"].get("clip_weight_name", None)
40
+
41
+ num_vpt = model_info["config"].get("num_vpt", None)
42
+ vpt_drop = model_info["config"].get("vpt_drop", None)
43
+
44
+
45
+ adapter = model_info["config"].get("adapter", False)
46
+ adapter_reduction = model_info["config"].get("adapter_reduction", None)
47
+
48
+ lora = model_info["config"].get("lora", False)
49
+ lora_rank = model_info["config"].get("lora_rank", None)
50
+ lora_alpha = model_info["config"].get("lora_alpha", None)
51
+ lora_dropout = model_info["config"].get("lora_dropout", None)
52
+
53
+ input_size = model_info["config"].get("input_size", None)
54
+ text_prompts = model_info["config"].get("text_prompts", None)
55
+
56
+ norm = model_info["config"].get("norm", "none")
57
+ act = model_info["config"].get("act", "none")
58
+
59
+ weights = model_info["weights"]
60
+
61
+ else:
62
+ assert model_name is not None, "model_name should be provided if model_info_path is not provided"
63
+ assert block_size is not None, "block_size should be provided"
64
+ assert bins is not None, "bins should be provided"
65
+ assert bin_centers is not None, "bin_centers should be provided"
66
+ weights = None
67
+
68
+ if "ViT" in model_name:
69
+ assert num_vpt is not None, f"num_vpt should be provided for ViT models, got {num_vpt}"
70
+ assert vpt_drop is not None, f"vpt_drop should be provided for ViT models, got {vpt_drop}"
71
+
72
+ if model_name.startswith("CLIP_") or model_name.startswith("CLIP-"):
73
+ assert clip_weight_name is not None, f"clip_weight_name should be provided for CLIP models, got {clip_weight_name}"
74
+ model = _clip_ebc(
75
+ model_name=model_name[5:],
76
+ weight_name=clip_weight_name,
77
+ block_size=block_size,
78
+ bins=bins,
79
+ bin_centers=bin_centers,
80
+ zero_inflated=zero_inflated,
81
+ num_vpt=num_vpt,
82
+ vpt_drop=vpt_drop,
83
+ input_size=input_size,
84
+ adapter=adapter,
85
+ adapter_reduction=adapter_reduction,
86
+ lora=lora,
87
+ lora_rank=lora_rank,
88
+ lora_alpha=lora_alpha,
89
+ lora_dropout=lora_dropout,
90
+ text_prompts=text_prompts,
91
+ norm=norm,
92
+ act=act
93
+ )
94
+ model_config = {
95
+ "model_name": model_name,
96
+ "block_size": block_size,
97
+ "bins": bins,
98
+ "bin_centers": bin_centers,
99
+ "zero_inflated": zero_inflated,
100
+ "clip_weight_name": clip_weight_name,
101
+ "num_vpt": num_vpt,
102
+ "vpt_drop": vpt_drop,
103
+ "input_size": input_size,
104
+ "adapter": adapter,
105
+ "adapter_reduction": adapter_reduction,
106
+ "lora": lora,
107
+ "lora_rank": lora_rank,
108
+ "lora_alpha": lora_alpha,
109
+ "lora_dropout": lora_dropout,
110
+ "text_prompts": model.text_prompts,
111
+ "norm": norm,
112
+ "act": act
113
+ }
114
+
115
+ else:
116
+ assert not adapter, "adapter for non-CLIP models is not implemented yet"
117
+ assert not lora, "lora for non-CLIP models is not implemented yet"
118
+ model = _ebc(
119
+ model_name=model_name,
120
+ block_size=block_size,
121
+ bins=bins,
122
+ bin_centers=bin_centers,
123
+ zero_inflated=zero_inflated,
124
+ num_vpt=num_vpt,
125
+ vpt_drop=vpt_drop,
126
+ input_size=input_size,
127
+ norm=norm,
128
+ act=act
129
+ )
130
+ model_config = {
131
+ "model_name": model_name,
132
+ "block_size": block_size,
133
+ "bins": bins,
134
+ "bin_centers": bin_centers,
135
+ "zero_inflated": zero_inflated,
136
+ "num_vpt": num_vpt,
137
+ "vpt_drop": vpt_drop,
138
+ "input_size": input_size,
139
+ "norm": norm,
140
+ "act": act
141
+ }
142
+
143
+ model.config = model_config
144
+ model_info = {"config": model_config, "weights": weights}
145
+
146
+ if weights is not None:
147
+ model.load_state_dict(weights)
148
+
149
+ if not os.path.exists(model_info_path):
150
+ torch.save(model_info, model_info_path)
151
+
152
+ return model
153
+
154
+
155
+ __all__ = ["get_model"]
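A minimal usage sketch for get_model (assumes the checkpoint layout used by app.py's load_model above; illustrative only, not part of the commit):

    from huggingface_hub import hf_hub_download
    from models import get_model

    # Repo id and filename follow the pattern in app.py: Yiming-M/{variant}, checkpoints/{dataset}/best_{metric}.pth
    ckpt_path = hf_hub_download(repo_id="Yiming-M/ZIP-B", filename="checkpoints/ShanghaiTech B/best_mae.pth")
    model = get_model(model_info_path=ckpt_path)  # config and weights are read from the saved dict
    model.eval()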
models/clip_ebc/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ from .model import CLIP_EBC, _clip_ebc
2
+
3
+
4
+ __all__ = [
5
+ "CLIP_EBC",
6
+ "_clip_ebc",
7
+ ]
models/clip_ebc/convnext.py ADDED
@@ -0,0 +1,199 @@
1
+ from torch import nn, Tensor
2
+ import open_clip
3
+ from peft import get_peft_model, LoraConfig
4
+
5
+ from ..utils import ConvRefine, ConvAdapter
6
+ from ..utils import ConvUpsample, _get_norm_layer, _get_activation
7
+
8
+
9
+ convnext_names_and_weights = {
10
+ "convnext_base": ["laion400m_s13b_b51k"], # 107.49M
11
+ "convnext_base_w": ["laion2b_s13b_b82k", "laion2b_s13b_b82k_augreg", "laion_aesthetic_s13b_b82k"], # 107.75M
12
+ "convnext_base_w_320": ["laion_aesthetic_s13b_b82k", "laion_aesthetic_s13b_b82k_augreg"], # 107.75M
13
+ "convnext_large_d": ["laion2b_s26b_b102k_augreg"], # 217.46M
14
+ "convnext_large_d_320": ["laion2b_s29b_b131k_ft", "laion2b_s29b_b131k_ft_soup"], # 217.46M
15
+ "convnext_xxlarge": ["laion2b_s34b_b82k_augreg", "laion2b_s34b_b82k_augreg_rewind", "laion2b_s34b_b82k_augreg_soup"] # 896.88M
16
+ }
17
+
18
+ refiner_channels = {
19
+ "convnext_base": 1024,
20
+ "convnext_base_w": 1024,
21
+ "convnext_base_w_320": 1024,
22
+ "convnext_large_d": 1536,
23
+ "convnext_large_d_320": 1536,
24
+ "convnext_xxlarge": 3072,
25
+ }
26
+
27
+ refiner_groups = {
28
+ "convnext_base": 1,
29
+ "convnext_base_w": 1,
30
+ "convnext_base_w_320": 1,
31
+ "convnext_large_d": refiner_channels["convnext_large_d"] // 512, # 3
32
+ "convnext_large_d_320": refiner_channels["convnext_large_d_320"] // 512, # 3
33
+ "convnext_xxlarge": refiner_channels["convnext_xxlarge"] // 512, # 6
34
+ }
35
+
36
+
37
+
38
+ class ConvNeXt(nn.Module):
39
+ def __init__(
40
+ self,
41
+ model_name: str,
42
+ weight_name: str,
43
+ block_size: int = 16,
44
+ adapter: bool = False,
45
+ adapter_reduction: int = 4,
46
+ norm: str = "none",
47
+ act: str = "none"
48
+ ) -> None:
49
+ super(ConvNeXt, self).__init__()
50
+ assert model_name in convnext_names_and_weights, f"Model name should be one of {list(convnext_names_and_weights.keys())}, but got {model_name}."
51
+ assert weight_name in convnext_names_and_weights[model_name], f"Pretrained should be one of {convnext_names_and_weights[model_name]}, but got {weight_name}."
52
+ assert block_size in [32, 16, 8], f"block_size should be one of [32, 16, 8], got {block_size}"
53
+ self.model_name, self.weight_name = model_name, weight_name
54
+ self.block_size = block_size
55
+
56
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
57
+
58
+ self.adapter = adapter
59
+ if adapter:
60
+ self.adapter_reduction = adapter_reduction
61
+ for param in model.parameters():
62
+ param.requires_grad = False
63
+
64
+ self.stem = model.trunk.stem
65
+ self.depth = len(model.trunk.stages)
66
+ for idx, stage in enumerate(model.trunk.stages):
67
+ setattr(self, f"stage{idx}", stage)
68
+ if adapter:
69
+ setattr(self, f"adapter{idx}", ConvAdapter(
70
+ in_channels=stage.blocks[-1].mlp.fc2.out_features,
71
+ bottleneck_channels=stage.blocks[-1].mlp.fc2.out_features // adapter_reduction,
72
+ ) if idx < self.depth - 1 else nn.Identity()) # No adapter for the last stage
73
+
74
+ if self.model_name in ["convnext_base", "convnext_base_w", "convnext_base_w_320", "convnext_xxlarge"]:
75
+ self.in_features, self.out_features = model.head.proj.in_features, model.head.proj.out_features
76
+ else: # "convnext_large_d", "convnext_large_d_320":
77
+ self.in_features, self.out_features = model.head.mlp.fc1.in_features, model.head.mlp.fc2.out_features
78
+
79
+ if norm == "bn":
80
+ norm_layer = nn.BatchNorm2d
81
+ elif norm == "ln":
82
+ norm_layer = nn.LayerNorm
83
+ else:
84
+ norm_layer = _get_norm_layer(model)
85
+
86
+ if act == "relu":
87
+ activation = nn.ReLU(inplace=True)
88
+ elif act == "gelu":
89
+ activation = nn.GELU()
90
+ else:
91
+ activation = _get_activation(model)
92
+
93
+ if block_size == 32:
94
+ self.refiner = ConvRefine(
95
+ in_channels=self.in_features,
96
+ out_channels=self.in_features,
97
+ norm_layer=norm_layer,
98
+ activation=activation,
99
+ groups=refiner_groups[self.model_name],
100
+ )
101
+ elif block_size == 16:
102
+ self.refiner = ConvUpsample(
103
+ in_channels=self.in_features,
104
+ out_channels=self.in_features,
105
+ norm_layer=norm_layer,
106
+ activation=activation,
107
+ groups=refiner_groups[self.model_name],
108
+ )
109
+ else: # block_size == 8
110
+ self.refiner = nn.Sequential(
111
+ ConvUpsample(
112
+ in_channels=self.in_features,
113
+ out_channels=self.in_features,
114
+ norm_layer=norm_layer,
115
+ activation=activation,
116
+ groups=refiner_groups[self.model_name],
117
+ ),
118
+ ConvUpsample(
119
+ in_channels=self.in_features,
120
+ out_channels=self.in_features,
121
+ norm_layer=norm_layer,
122
+ activation=activation,
123
+ groups=refiner_groups[self.model_name],
124
+ ),
125
+ )
126
+
127
+ def train(self, mode: bool = True):
128
+ if self.adapter and mode:
129
+ # training:
130
+ self.stem.eval()
131
+
132
+ for idx in range(self.depth):
133
+ getattr(self, f"stage{idx}").eval()
134
+ getattr(self, f"adapter{idx}").train()
135
+
136
+ self.refiner.train()
137
+
138
+ else:
139
+ # evaluation:
140
+ for module in self.children():
141
+ module.train(mode)
142
+
143
+ def forward(self, x: Tensor) -> Tensor:
144
+ x = self.stem(x)
145
+
146
+ for idx in range(self.depth):
147
+ x = getattr(self, f"stage{idx}")(x)
148
+ if self.adapter:
149
+ x = getattr(self, f"adapter{idx}")(x)
150
+
151
+ x = self.refiner(x)
152
+ return x
153
+
154
+
155
+ def _convnext(
156
+ model_name: str,
157
+ weight_name: str,
158
+ block_size: int = 16,
159
+ adapter: bool = False,
160
+ adapter_reduction: int = 4,
161
+ lora: bool = False,
162
+ lora_rank: int = 16,
163
+ lora_alpha: float = 32.0,
164
+ lora_dropout: float = 0.1,
165
+ norm: str = "none",
166
+ act: str = "none"
167
+ ) -> ConvNeXt:
168
+ assert not (lora and adapter), "Lora and adapter cannot be used together."
169
+ model = ConvNeXt(
170
+ model_name=model_name,
171
+ weight_name=weight_name,
172
+ block_size=block_size,
173
+ adapter=adapter,
174
+ adapter_reduction=adapter_reduction,
175
+ norm=norm,
176
+ act=act
177
+ )
178
+
179
+ if lora:
180
+ target_modules = []
181
+ for name, module in model.named_modules():
182
+ if isinstance(module, (nn.Linear, nn.Conv2d)) and "refiner" not in name:
183
+ target_modules.append(name)
184
+
185
+ lora_config = LoraConfig(
186
+ r=lora_rank,
187
+ lora_alpha=lora_alpha,
188
+ lora_dropout=lora_dropout,
189
+ bias="none",
190
+ target_modules=target_modules,
191
+ )
192
+ model = get_peft_model(model, lora_config)
193
+
194
+ # Unfreeze refiner
195
+ for name, module in model.named_modules():
196
+ if "refiner" in name:
197
+ module.requires_grad_(True)
198
+
199
+ return model
models/clip_ebc/mobileclip.py ADDED
@@ -0,0 +1,197 @@
1
+ from torch import nn, Tensor
2
+ import open_clip
3
+ from peft import get_peft_model, LoraConfig
4
+
5
+ from ..utils import ConvRefine, ConvUpsample, ConvAdapter
6
+ from ..utils import _get_norm_layer, _get_activation
7
+
8
+
9
+ mobileclip_names_and_weights = {
10
+ "MobileCLIP-S1": ["datacompdr"],
11
+ "MobileCLIP-S2": ["datacompdr"],
12
+ }
13
+
14
+
15
+ refiner_channels = {
16
+ "MobileCLIP-S1": 1024,
17
+ "MobileCLIP-S2": 1280,
18
+ }
19
+
20
+ refiner_groups = {
21
+ "MobileCLIP-S1": 2,
22
+ "MobileCLIP-S2": 2,
23
+ }
24
+
25
+
26
+ class MobileCLIP(nn.Module):
27
+ def __init__(
28
+ self,
29
+ model_name: str,
30
+ weight_name: str,
31
+ block_size: int = 16,
32
+ adapter: bool = False,
33
+ adapter_reduction: int = 4,
34
+ norm: str = "none",
35
+ act: str = "none"
36
+ ) -> None:
37
+ super().__init__()
38
+ assert model_name in mobileclip_names_and_weights, f"Model name should be one of {list(mobileclip_names_and_weights.keys())}, but got {model_name}."
39
+ assert weight_name in mobileclip_names_and_weights[model_name], f"Pretrained should be one of {mobileclip_names_and_weights[model_name]}, but got {weight_name}."
40
+ assert block_size in [32, 16, 8], f"block_size should be one of [32, 16, 8], got {block_size}"
41
+ self.model_name, self.weight_name = model_name, weight_name
42
+ self.block_size = block_size
43
+
44
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
45
+
46
+ self.adapter = adapter
47
+ if adapter:
48
+ for param in model.parameters():
49
+ param.requires_grad = False
50
+
51
+ self.stem = model.trunk.stem
52
+ self.stages = model.trunk.stages
53
+
54
+ self.depth = len(model.trunk.stages)
55
+ for idx, stage in enumerate(model.trunk.stages):
56
+ if adapter:
57
+ setattr(self, f"adapter{idx}", ConvAdapter(
58
+ in_channels=stage.blocks[-1].mlp.fc2.out_channels,
59
+ bottleneck_channels=stage.blocks[-1].mlp.fc2.out_channels // adapter_reduction,
60
+ ))
61
+
62
+ self.final_conv = model.trunk.final_conv
63
+
64
+ self.in_features, self.out_features = model.trunk.head.fc.in_features, model.trunk.head.fc.out_features
65
+
66
+ # refine_block = LightConvRefine if model_name == "MobileCLIP-S1" else ConvRefine
67
+ # upsample_block = LightConvUpsample if model_name == "MobileCLIP-S1" else ConvUpsample
68
+
69
+ if norm == "bn":
70
+ norm_layer = nn.BatchNorm2d
71
+ elif norm == "ln":
72
+ norm_layer = nn.LayerNorm
73
+ else:
74
+ norm_layer = _get_norm_layer(model)
75
+
76
+ if act == "relu":
77
+ activation = nn.ReLU(inplace=True)
78
+ elif act == "gelu":
79
+ activation = nn.GELU()
80
+ else:
81
+ activation = _get_activation(model)
82
+
83
+ if block_size == 32:
84
+ self.refiner = ConvRefine(
85
+ in_channels=self.in_features,
86
+ out_channels=self.in_features,
87
+ norm_layer=norm_layer,
88
+ activation=activation,
89
+ groups=refiner_groups[model_name],
90
+ )
91
+ elif block_size == 16:
92
+ self.refiner = ConvUpsample(
93
+ in_channels=self.in_features,
94
+ out_channels=self.in_features,
95
+ norm_layer=norm_layer,
96
+ activation=activation,
97
+ groups=refiner_groups[self.model_name],
98
+ )
99
+ else: # block_size == 8
100
+ self.refiner = nn.Sequential(
101
+ ConvUpsample(
102
+ in_channels=self.in_features,
103
+ out_channels=self.in_features,
104
+ norm_layer=norm_layer,
105
+ activation=activation,
106
+ groups=refiner_groups[self.model_name],
107
+ ),
108
+ ConvUpsample(
109
+ in_channels=self.in_features,
110
+ out_channels=self.in_features,
111
+ norm_layer=norm_layer,
112
+ activation=activation,
113
+ groups=refiner_groups[self.model_name],
114
+ ),
115
+ )
116
+
117
+ def train(self, mode: bool = True):
118
+ if self.adapter and mode:
119
+ # training:
120
+ self.stem.eval()
121
+
122
+ for idx in range(self.depth):
123
+ self.stages[idx].eval()  # stages live in self.stages; this class does not set per-index "stage{idx}" attributes
124
+ getattr(self, f"adapter{idx}").train()
125
+
126
+ self.final_conv.eval()
127
+ self.refiner.train()
128
+
129
+ else:
130
+ # evaluation:
131
+ for module in self.children():
132
+ module.train(mode)
133
+
134
+ def forward(self, x: Tensor) -> Tensor:
135
+ x = self.stem(x)
136
+
137
+ for idx in range(self.depth):
138
+ x = self.stages[idx](x)
139
+ if self.adapter:
140
+ x = getattr(self, f"adapter{idx}")(x)
141
+
142
+ x = self.final_conv(x)
143
+
144
+ x = self.refiner(x)
145
+ return x
146
+
147
+
148
+ def _mobileclip(
149
+ model_name: str,
150
+ weight_name: str,
151
+ block_size: int = 16,
152
+ adapter: bool = False,
153
+ adapter_reduction: int = 4,
154
+ lora: bool = False,
155
+ lora_rank: int = 16,
156
+ lora_alpha: float = 32.0,
157
+ lora_dropout: float = 0.1,
158
+ norm: str = "none",
159
+ act: str = "none"
160
+ ) -> MobileCLIP:
161
+ assert not (lora and adapter), "Lora and adapter cannot be used together."
162
+ model = MobileCLIP(
163
+ model_name=model_name,
164
+ weight_name=weight_name,
165
+ block_size=block_size,
166
+ adapter=adapter,
167
+ adapter_reduction=adapter_reduction,
168
+ norm=norm,
169
+ act=act
170
+ )
171
+
172
+ if lora:
173
+ target_modules = []
174
+ for name, module in model.named_modules():
175
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
176
+ target_modules.append(name)
177
+
178
+ lora_config = LoraConfig(
179
+ r=lora_rank,
180
+ lora_alpha=lora_alpha,
181
+ lora_dropout=lora_dropout,
182
+ bias="none",
183
+ target_modules=target_modules,
184
+ )
185
+ model = get_peft_model(model, lora_config)
186
+
187
+ # Unfreeze the BN layers
188
+ for name, module in model.named_modules():
189
+ if isinstance(module, nn.BatchNorm2d) and "refiner" not in name:
190
+ module.requires_grad_(True)
191
+
192
+ # Unfreeze refiner
193
+ for name, module in model.named_modules():
194
+ if "refiner" in name:
195
+ module.requires_grad_(True)
196
+
197
+ return model
models/clip_ebc/model.py ADDED
@@ -0,0 +1,272 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from typing import List, Optional, Dict, Tuple
6
+ from copy import deepcopy
7
+
8
+ from .vit import vit_names_and_weights, _vit
9
+ from .convnext import convnext_names_and_weights, _convnext
10
+ from .resnet import resnet_names_and_weights, _resnet
11
+ from .mobileclip import mobileclip_names_and_weights, _mobileclip
12
+
13
+ from .utils import encode_text, optimize_text_prompts
14
+ from ..utils import conv1x1
15
+
16
+ supported_models_and_weights = deepcopy(vit_names_and_weights)
17
+ supported_models_and_weights.update(convnext_names_and_weights)
18
+ supported_models_and_weights.update(resnet_names_and_weights)
19
+ supported_models_and_weights.update(mobileclip_names_and_weights)
20
+
21
+
22
+ class CLIP_EBC(nn.Module):
23
+ def __init__(
24
+ self,
25
+ model_name: str,
26
+ weight_name: str,
27
+ block_size: Optional[int] = None,
28
+ bins: Optional[List[Tuple[float, float]]] = None,
29
+ bin_centers: Optional[List[float]] = None,
30
+ zero_inflated: Optional[bool] = True,
31
+ num_vpt: Optional[int] = None,
32
+ vpt_drop: Optional[float] = None,
33
+ input_size: Optional[int] = None,
34
+ adapter: Optional[bool] = False,
35
+ adapter_reduction: Optional[int] = None,
36
+ lora: Optional[bool] = False,
37
+ lora_rank: Optional[int] = None,
38
+ lora_alpha: Optional[float] = None,
39
+ lora_dropout: Optional[float] = None,
40
+ text_prompts: Optional[Dict[str, List[str]]] = None,
41
+ norm: Optional[str] = "none",
42
+ act: Optional[str] = "none",
43
+ ) -> None:
44
+ super().__init__()
45
+ if "mobileclip" in model_name.lower() or "vit" in model_name.lower():
46
+ model_name = model_name.replace("_", "-")
47
+ assert model_name in supported_models_and_weights, f"Model name should be one of {list(supported_models_and_weights.keys())}, but got {model_name}."
48
+ assert weight_name in supported_models_and_weights[model_name], f"Pretrained should be one of {supported_models_and_weights[model_name]}, but got {weight_name}."
49
+ assert len(bins) == len(bin_centers), f"Expected bins and bin_centers to have the same length, got {len(bins)} and {len(bin_centers)}"
50
+ assert len(bins) >= 2, f"Expected at least 2 bins, got {len(bins)}"
51
+ assert all(len(b) == 2 for b in bins), f"Expected bins to be a list of tuples of length 2, got {bins}"
52
+ bins = [(float(b[0]), float(b[1])) for b in bins]
53
+ assert all(bin[0] <= p <= bin[1] for bin, p in zip(bins, bin_centers)), f"Expected bin_centers to be within the range of the corresponding bin, got {bins} and {bin_centers}"
54
+
55
+ self.model_name = model_name
56
+ self.weight_name = weight_name
57
+ self.block_size = block_size
58
+ self.bins = bins
59
+ self.register_buffer("bin_centers", torch.tensor(bin_centers, dtype=torch.float32, requires_grad=False).view(1, -1, 1, 1))
60
+ self.zero_inflated = zero_inflated
61
+ self.text_prompts = text_prompts
62
+
63
+ # Image encoder
64
+ if model_name in vit_names_and_weights:
65
+ assert num_vpt is not None and num_vpt >= 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
66
+ vpt_drop = 0. if vpt_drop is None else vpt_drop
67
+ self.backbone = _vit(
68
+ model_name=model_name,
69
+ weight_name=weight_name,
70
+ num_vpt=num_vpt,
71
+ vpt_drop=vpt_drop,
72
+ block_size=block_size,
73
+ adapter=adapter,
74
+ adapter_reduction=adapter_reduction,
75
+ lora=lora,
76
+ lora_rank=lora_rank,
77
+ lora_alpha=lora_alpha,
78
+ lora_dropout=lora_dropout,
79
+ input_size=(input_size, input_size),
80
+ norm=norm,
81
+ act=act
82
+ )
83
+ elif model_name in convnext_names_and_weights:
84
+ self.backbone = _convnext(
85
+ model_name=model_name,
86
+ weight_name=weight_name,
87
+ block_size=block_size,
88
+ adapter=adapter,
89
+ adapter_reduction=adapter_reduction,
90
+ lora=lora,
91
+ lora_rank=lora_rank,
92
+ lora_alpha=lora_alpha,
93
+ lora_dropout=lora_dropout,
94
+ norm=norm,
95
+ act=act
96
+ )
97
+ elif model_name in resnet_names_and_weights:
98
+ self.backbone = _resnet(
99
+ model_name=model_name,
100
+ weight_name=weight_name,
101
+ block_size=block_size,
102
+ adapter=adapter,
103
+ adapter_reduction=adapter_reduction,
104
+ lora=lora,
105
+ lora_rank=lora_rank,
106
+ lora_alpha=lora_alpha,
107
+ lora_dropout=lora_dropout,
108
+ norm=norm,
109
+ act=act
110
+ )
111
+ elif model_name in mobileclip_names_and_weights:
112
+ self.backbone = _mobileclip(
113
+ model_name=model_name,
114
+ weight_name=weight_name,
115
+ block_size=block_size,
116
+ adapter=adapter,
117
+ adapter_reduction=adapter_reduction,
118
+ lora=lora,
119
+ lora_rank=lora_rank,
120
+ lora_alpha=lora_alpha,
121
+ lora_dropout=lora_dropout,
122
+ norm=norm,
123
+ act=act
124
+ )
125
+
126
+ self._build_text_feats()
127
+ self._build_head()
128
+
129
+ def _build_text_feats(self) -> None:
130
+ model_name, weight_name = self.model_name, self.weight_name
131
+ text_prompts = self.text_prompts
132
+
133
+ if text_prompts is None:
134
+ bins = [b[0] if b[0] == b[1] else b for b in self.bins] # if the bin is a single value (e.g., [0, 0]), use that value
135
+ if self.zero_inflated: # separate 0 from the rest
136
+ assert bins[0] == 0, f"Expected the first bin to be 0, got {bins[0]}."
137
+ bins_pi = [0, (1, float("inf"))]
138
+ bins_lambda = bins[1:]
139
+ pi_text_prompts = optimize_text_prompts(model_name, weight_name, bins_pi)
140
+ lambda_text_prompts = optimize_text_prompts(model_name, weight_name, bins_lambda)
141
+ self.text_prompts = {"pi": pi_text_prompts, "lambda": lambda_text_prompts}
142
+ pi_text_feats = encode_text(model_name, weight_name, pi_text_prompts)
143
+ lambda_text_feats = encode_text(model_name, weight_name, lambda_text_prompts)
144
+ pi_text_feats.requires_grad = False
145
+ lambda_text_feats.requires_grad = False
146
+ self.register_buffer("pi_text_feats", pi_text_feats)
147
+ self.register_buffer("lambda_text_feats", lambda_text_feats)
148
+
149
+ else:
150
+ text_prompts = optimize_text_prompts(model_name, weight_name, bins)
151
+ self.text_prompts = text_prompts
152
+ text_feats = encode_text(model_name, weight_name, text_prompts)
153
+ text_feats.requires_grad = False
154
+ self.register_buffer("text_feats", text_feats)
155
+
156
+ else:
157
+ if self.zero_inflated:
158
+ assert "pi" in text_prompts and "lambda" in text_prompts, f"Expected text_prompts to have keys 'pi' and 'lambda', got {text_prompts.keys()}."
159
+ pi_text_prompts = text_prompts["pi"]
160
+ lambda_text_prompts = text_prompts["lambda"]
161
+ pi_text_feats = encode_text(model_name, weight_name, pi_text_prompts)
162
+ lambda_text_feats = encode_text(model_name, weight_name, lambda_text_prompts)
163
+ pi_text_feats.requires_grad = False
164
+ lambda_text_feats.requires_grad = False
165
+ self.register_buffer("pi_text_feats", pi_text_feats)
166
+ self.register_buffer("lambda_text_feats", lambda_text_feats)
167
+
168
+ else:
169
+ text_feats = encode_text(model_name, weight_name, text_prompts)
170
+ text_feats.requires_grad = False
171
+ self.register_buffer("text_feats", text_feats)
172
+
173
+ def _build_head(self) -> None:
174
+ in_channels = self.backbone.in_features
175
+ out_channels = self.backbone.out_features
176
+ if self.zero_inflated:
177
+ self.pi_logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07), requires_grad=True)
178
+ self.lambda_logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07), requires_grad=True)
179
+
180
+ self.pi_head = conv1x1(in_channels, out_channels, bias=False)
181
+ self.lambda_head = conv1x1(in_channels, out_channels, bias=False)
182
+
183
+ else:
184
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07), requires_grad=True)
185
+ self.head = conv1x1(in_channels, out_channels, bias=False)
186
+
187
+ def forward(self, image: Tensor):
188
+ image_feats = self.backbone(image)
189
+ # image_feats = F.normalize(image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
190
+
191
+ if self.zero_inflated:
192
+ pi_image_feats, lambda_image_feats = self.pi_head(image_feats), self.lambda_head(image_feats)
193
+ pi_image_feats = F.normalize(pi_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
194
+ lambda_image_feats = F.normalize(lambda_image_feats.permute(0, 2, 3, 1), p=2, dim=-1) # shape (B, H, W, C)
195
+
196
+ pi_text_feats, lambda_text_feats = self.pi_text_feats, self.lambda_text_feats
197
+ pi_logit_scale, lambda_logit_scale = self.pi_logit_scale.exp(), self.lambda_logit_scale.exp()
198
+
199
+ pi_logit_map = pi_logit_scale * pi_image_feats @ pi_text_feats.t() # (B, H, W, 2), logits per image
200
+ lambda_logit_map = lambda_logit_scale * lambda_image_feats @ lambda_text_feats.t() # (B, H, W, N - 1), logits per image
201
+
202
+ pi_logit_map = pi_logit_map.permute(0, 3, 1, 2) # (B, 2, H, W)
203
+ lambda_logit_map = lambda_logit_map.permute(0, 3, 1, 2) # (B, N - 1, H, W)
204
+
205
+ lambda_map = (lambda_logit_map.softmax(dim=1) * self.bin_centers[:, 1:]).sum(dim=1, keepdim=True) # (B, 1, H, W)
206
+
207
+ # pi_logit_map.softmax(dim=1)[:, 0] is the probability of zeros
208
+ den_map = pi_logit_map.softmax(dim=1)[:, 1:] * lambda_map # (B, 1, H, W)
209
+
210
+ if self.training:
211
+ return pi_logit_map, lambda_logit_map, lambda_map, den_map
212
+ else:
213
+ return den_map
214
+
215
+ else:
216
+ image_feats = self.head(image_feats)
217
+ image_feats = F.normalize(image_feats.permute(0, 2, 3, 1), p=2, dim=-1)
218
+
219
+ text_feats = self.text_feats
220
+ logit_scale = self.logit_scale.exp()
221
+
222
+ logit_map = logit_scale * image_feats @ text_feats.t() # (B, H, W, N), logits per image
223
+ logit_map = logit_map.permute(0, 3, 1, 2) # (B, N, H, W)
224
+
225
+ den_map = (logit_map.softmax(dim=1) * self.bin_centers).sum(dim=1, keepdim=True) # (B, 1, H, W)
226
+
227
+ if self.training:
228
+ return logit_map, den_map
229
+ else:
230
+ return den_map
231
+
232
+
233
+ def _clip_ebc(
234
+ model_name: str,
235
+ weight_name: str,
236
+ block_size: Optional[int] = None,
237
+ bins: Optional[List[Tuple[float, float]]] = None,
238
+ bin_centers: Optional[List[float]] = None,
239
+ zero_inflated: Optional[bool] = True,
240
+ num_vpt: Optional[int] = None,
241
+ vpt_drop: Optional[float] = None,
242
+ input_size: Optional[int] = None,
243
+ adapter: Optional[bool] = False,
244
+ adapter_reduction: Optional[int] = None,
245
+ lora: Optional[bool] = False,
246
+ lora_rank: Optional[int] = None,
247
+ lora_alpha: Optional[float] = None,
248
+ lora_dropout: Optional[float] = None,
249
+ text_prompts: Optional[List[str]] = None,
250
+ norm: Optional[str] = "none",
251
+ act: Optional[str] = "none",
252
+ ) -> CLIP_EBC:
253
+ return CLIP_EBC(
254
+ model_name=model_name,
255
+ weight_name=weight_name,
256
+ block_size=block_size,
257
+ bins=bins,
258
+ bin_centers=bin_centers,
259
+ zero_inflated=zero_inflated,
260
+ num_vpt=num_vpt,
261
+ vpt_drop=vpt_drop,
262
+ input_size=input_size,
263
+ adapter=adapter,
264
+ adapter_reduction=adapter_reduction,
265
+ lora=lora,
266
+ lora_rank=lora_rank,
267
+ lora_alpha=lora_alpha,
268
+ lora_dropout=lora_dropout,
269
+ text_prompts=text_prompts,
270
+ norm=norm,
271
+ act=act,
272
+ )
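
A minimal usage sketch for the factory above (not part of the commit): the bins, bin centers, input size and import path are illustrative assumptions, and the first call downloads the CLIP weights and, if no text_prompts are passed, may spend time searching for prompt templates.

import torch
from models.clip_ebc.model import _clip_ebc  # assumes the repo root is on PYTHONPATH

bins = [(0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (4.0, float("inf"))]  # illustrative: a zero bin plus three count bins
bin_centers = [0.0, 1.0, 2.5, 5.0]                                # one representative count per bin

model = _clip_ebc(
    model_name="RN50", weight_name="openai",
    block_size=16, bins=bins, bin_centers=bin_centers, zero_inflated=True,
).eval()

with torch.no_grad():
    den_map = model(torch.rand(1, 3, 224, 224))  # eval mode returns only the density map
print(den_map.shape, den_map.sum().item())       # the predicted count is usually read off as the sum over blocks
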
models/clip_ebc/resnet.py ADDED
@@ -0,0 +1,236 @@
1
+ from torch import nn, Tensor
2
+ import open_clip
3
+ from peft import get_peft_model, LoraConfig
4
+
5
+ from ..utils import ConvRefine, ConvUpsample, ConvAdapter
6
+ from ..utils import _get_norm_layer, _get_activation
7
+
8
+
9
+ resnet_names_and_weights = {
10
+ "RN50": ["openai", "yfcc15m", "cc12m"],
11
+ "RN101": ["openai", "yfcc15m", "cc12m"],
12
+ "RN50x4": ["openai", "yfcc15m", "cc12m"],
13
+ "RN50x16": ["openai", "yfcc15m", "cc12m"],
14
+ "RN50x64": ["openai", "yfcc15m", "cc12m"],
15
+ }
16
+
17
+ refiner_channels = {
18
+ "RN50": 2048,
19
+ "RN101": 2048,
20
+ "RN50x4": 2560,
21
+ "RN50x16": 3072,
22
+ "RN50x64": 4096,
23
+ }
24
+
25
+ refiner_groups = {
26
+ "RN50": refiner_channels["RN50"] // 512, # 4
27
+ "RN101": refiner_channels["RN101"] // 512, # 4
28
+ "RN50x4": refiner_channels["RN50x4"] // 512, # 5
29
+ "RN50x16": refiner_channels["RN50x16"] // 512, # 6
30
+ "RN50x64": refiner_channels["RN50x64"] // 512, # 8
31
+ }
32
+
33
+
34
+ class ResNet(nn.Module):
35
+ def __init__(
36
+ self,
37
+ model_name: str,
38
+ weight_name: str,
39
+ block_size: int = 16,
40
+ adapter: bool = False,
41
+ adapter_reduction: int = 4,
42
+ norm: str = "none",
43
+ act: str = "none"
44
+ ) -> None:
45
+ super(ResNet, self).__init__()
46
+ assert model_name in resnet_names_and_weights, f"Model name should be one of {list(resnet_names_and_weights.keys())}, but got {model_name}."
47
+ assert weight_name in resnet_names_and_weights[model_name], f"Pretrained should be one of {resnet_names_and_weights[model_name]}, but got {weight_name}."
48
+ assert block_size in [32, 16, 8], f"block_size should be one of [32, 16, 8], got {block_size}"
49
+ self.model_name, self.weight_name = model_name, weight_name
50
+ self.block_size = block_size
51
+
52
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
53
+
54
+ self.adapter = adapter
55
+ if adapter:
56
+ for param in model.parameters():
57
+ param.requires_grad = False
58
+
59
+ # Stem
60
+ self.conv1 = model.conv1
61
+ self.bn1 = model.bn1
62
+ self.act1 = model.act1
63
+ self.conv2 = model.conv2
64
+ self.bn2 = model.bn2
65
+ self.act2 = model.act2
66
+ self.conv3 = model.conv3
67
+ self.bn3 = model.bn3
68
+ self.act3 = model.act3
69
+ self.avgpool = model.avgpool
70
+ # Stem: reduction = 4
71
+
72
+ # Layers
73
+ for idx in range(1, 5):
74
+ setattr(self, f"layer{idx}", getattr(model, f"layer{idx}"))
75
+ if adapter:
76
+ setattr(self, f"adapter{idx}", ConvAdapter(
77
+ in_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels,
78
+ bottleneck_channels=getattr(model, f"layer{idx}")[-1].conv3.out_channels // adapter_reduction,
79
+ ) if idx < 4 else nn.Identity()) # No adapter for the last layer
80
+
81
+ self.in_features = model.attnpool.c_proj.weight.shape[1]
82
+ self.out_features = model.attnpool.c_proj.weight.shape[0]
83
+
84
+ if norm == "bn":
85
+ norm_layer = nn.BatchNorm2d
86
+ elif norm == "ln":
87
+ norm_layer = nn.LayerNorm
88
+ else:
89
+ norm_layer = _get_norm_layer(model)
90
+
91
+ if act == "relu":
92
+ activation = nn.ReLU(inplace=True)
93
+ elif act == "gelu":
94
+ activation = nn.GELU()
95
+ else:
96
+ activation = _get_activation(model)
97
+
98
+ if block_size == 32:
99
+ self.refiner = ConvRefine(
100
+ in_channels=self.in_features,
101
+ out_channels=self.in_features,
102
+ norm_layer=norm_layer,
103
+ activation=activation,
104
+ groups=refiner_groups[self.model_name],
105
+ )
106
+ elif block_size == 16:
107
+ self.refiner = ConvUpsample(
108
+ in_channels=self.in_features,
109
+ out_channels=self.in_features,
110
+ norm_layer=norm_layer,
111
+ activation=activation,
112
+ groups=refiner_groups[self.model_name],
113
+ )
114
+ else: # block_size == 8
115
+ self.refiner = nn.Sequential(
116
+ ConvUpsample(
117
+ in_channels=self.in_features,
118
+ out_channels=self.in_features,
119
+ norm_layer=norm_layer,
120
+ activation=activation,
121
+ groups=refiner_groups[self.model_name],
122
+ ),
123
+ ConvUpsample(
124
+ in_channels=self.in_features,
125
+ out_channels=self.in_features,
126
+ norm_layer=norm_layer,
127
+ activation=activation,
128
+ groups=refiner_groups[self.model_name],
129
+ ),
130
+ )
131
+
132
+ def train(self, mode: bool = True):
133
+ if self.adapter and mode:
134
+ # training:
135
+ self.conv1.eval()
136
+ self.bn1.eval()
137
+ self.act1.eval()
138
+ self.conv2.eval()
139
+ self.bn2.eval()
140
+ self.act2.eval()
141
+ self.conv3.eval()
142
+ self.bn3.eval()
143
+ self.act3.eval()
144
+ self.avgpool.eval()
145
+
146
+ for idx in range(1, 5):
147
+ getattr(self, f"layer{idx}").eval()
148
+ getattr(self, f"adapter{idx}").train()
149
+
150
+ self.refiner.train()
151
+
152
+ else:
153
+ # evaluation:
154
+ for module in self.children():
155
+ module.train(mode)
156
+
157
+ def stem(self, x: Tensor) -> Tensor:
158
+ x = self.act1(self.bn1(self.conv1(x)))
159
+ x = self.act2(self.bn2(self.conv2(x)))
160
+ x = self.act3(self.bn3(self.conv3(x)))
161
+ x = self.avgpool(x)
162
+ return x
163
+
164
+ def forward(self, x: Tensor) -> Tensor:
165
+ x = self.stem(x)
166
+
167
+ x = self.layer1(x)
168
+ if self.adapter:
169
+ x = self.adapter1(x)
170
+
171
+ x = self.layer2(x)
172
+ if self.adapter:
173
+ x = self.adapter2(x)
174
+
175
+ x = self.layer3(x)
176
+ if self.adapter:
177
+ x = self.adapter3(x)
178
+
179
+ x = self.layer4(x)
180
+ if self.adapter:
181
+ x = self.adapter4(x)
182
+
183
+ x = self.refiner(x)
184
+ return x
185
+
186
+
187
+ def _resnet(
188
+ model_name: str,
189
+ weight_name: str,
190
+ block_size: int = 16,
191
+ adapter: bool = False,
192
+ adapter_reduction: int = 4,
193
+ lora: bool = False,
194
+ lora_rank: int = 16,
195
+ lora_alpha: float = 32.0,
196
+ lora_dropout: float = 0.1,
197
+ norm: str = "none",
198
+ act: str = "none"
199
+ ) -> ResNet:
200
+ assert not (lora and adapter), "LoRA and adapter cannot be used together."
201
+ model = ResNet(
202
+ model_name=model_name,
203
+ weight_name=weight_name,
204
+ block_size=block_size,
205
+ adapter=adapter,
206
+ adapter_reduction=adapter_reduction,
207
+ norm=norm,
208
+ act=act
209
+ )
210
+
211
+ if lora:
212
+ target_modules = []
213
+ for name, module in model.named_modules():
214
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
215
+ target_modules.append(name)
216
+
217
+ lora_config = LoraConfig(
218
+ r=lora_rank,
219
+ lora_alpha=lora_alpha,
220
+ lora_dropout=lora_dropout,
221
+ bias="none",
222
+ target_modules=target_modules,
223
+ )
224
+ model = get_peft_model(model, lora_config)
225
+
226
+ # Unfreeze BN layers
227
+ for name, module in model.named_modules():
228
+ if isinstance(module, nn.BatchNorm2d) and "refiner" not in name:
229
+ module.requires_grad_(True)
230
+
231
+ # Unfreeze refiner
232
+ for name, module in model.named_modules():
233
+ if "refiner" in name:
234
+ module.requires_grad_(True)
235
+
236
+ return model
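
A rough sketch of how this wrapper is meant to be used (not part of the commit; the import path and the RN50 shape in the comment are assumptions). With adapter=True the CLIP trunk stays frozen, so only the adapter and refiner parameters should require gradients:

import torch
from models.clip_ebc.resnet import _resnet  # assumes the repo root is on PYTHONPATH

backbone = _resnet("RN50", "openai", block_size=16, adapter=True).eval()
trainable = [name for name, p in backbone.named_parameters() if p.requires_grad]
print(trainable[:4])  # expected to list only adapter* / refiner parameters

with torch.no_grad():
    feats = backbone(torch.rand(1, 3, 224, 224))
print(feats.shape)    # roughly (1, 2048, 14, 14) for RN50 with block_size=16
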
models/clip_ebc/utils.py ADDED
@@ -0,0 +1,137 @@
1
+ import torch
2
+ from torch import Tensor, nn
3
+ import torch.nn.functional as F
4
+ import open_clip
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ from typing import Union, Tuple, List
8
+
9
+
10
+ num_to_word = {
11
+ "0": "zero", "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9": "nine",
12
+ "10": "ten", "11": "eleven", "12": "twelve", "13": "thirteen", "14": "fourteen", "15": "fifteen", "16": "sixteen", "17": "seventeen", "18": "eighteen", "19": "nineteen",
13
+ "20": "twenty", "21": "twenty-one", "22": "twenty-two", "23": "twenty-three", "24": "twenty-four", "25": "twenty-five", "26": "twenty-six", "27": "twenty-seven", "28": "twenty-eight", "29": "twenty-nine",
14
+ "30": "thirty", "31": "thirty-one", "32": "thirty-two", "33": "thirty-three", "34": "thirty-four", "35": "thirty-five", "36": "thirty-six", "37": "thirty-seven", "38": "thirty-eight", "39": "thirty-nine",
15
+ "40": "forty", "41": "forty-one", "42": "forty-two", "43": "forty-three", "44": "forty-four", "45": "forty-five", "46": "forty-six", "47": "forty-seven", "48": "forty-eight", "49": "forty-nine",
16
+ "50": "fifty", "51": "fifty-one", "52": "fifty-two", "53": "fifty-three", "54": "fifty-four", "55": "fifty-five", "56": "fifty-six", "57": "fifty-seven", "58": "fifty-eight", "59": "fifty-nine",
17
+ "60": "sixty", "61": "sixty-one", "62": "sixty-two", "63": "sixty-three", "64": "sixty-four", "65": "sixty-five", "66": "sixty-six", "67": "sixty-seven", "68": "sixty-eight", "69": "sixty-nine",
18
+ "70": "seventy", "71": "seventy-one", "72": "seventy-two", "73": "seventy-three", "74": "seventy-four", "75": "seventy-five", "76": "seventy-six", "77": "seventy-seven", "78": "seventy-eight", "79": "seventy-nine",
19
+ "80": "eighty", "81": "eighty-one", "82": "eighty-two", "83": "eighty-three", "84": "eighty-four", "85": "eighty-five", "86": "eighty-six", "87": "eighty-seven", "88": "eighty-eight", "89": "eighty-nine",
20
+ "90": "ninety", "91": "ninety-one", "92": "ninety-two", "93": "ninety-three", "94": "ninety-four", "95": "ninety-five", "96": "ninety-six", "97": "ninety-seven", "98": "ninety-eight", "99": "ninety-nine",
21
+ "100": "one hundred"
22
+ }
23
+
24
+ prefixes = [
25
+ "",
26
+ "A photo of", "A block of", "An image of", "A picture of",
27
+ "There are",
28
+ "The image contains", "The photo contains", "The picture contains",
29
+ "The image shows", "The photo shows", "The picture shows",
30
+ ]
31
+ arabic_numeral = [True, False]
32
+ compares = [
33
+ "more than", "greater than", "higher than", "larger than", "bigger than", "greater than or equal to",
34
+ "at least", "no less than", "not less than", "not fewer than", "not lower than", "not smaller than", "not less than or equal to",
35
+ "over", "above", "beyond", "exceeding", "surpassing",
36
+ ]
37
+ suffixes = [
38
+ "people", "persons", "individuals", "humans", "faces", "heads", "figures", "",
39
+ ]
40
+
41
+
42
+ def num2word(num: Union[int, str]) -> str:
43
+ """
44
+ Convert the input number to the corresponding English word. For example, 1 -> "one", 2 -> "two", etc.
45
+ """
46
+ num = str(int(num))
47
+ return num_to_word.get(num, num)
48
+
49
+
50
+ def format_count(
51
+ bins: List[Union[float, Tuple[float, float]]],
52
+ ) -> List[List[str]]:
53
+ text_prompts = []
54
+ for prefix in prefixes:
55
+ for numeral in arabic_numeral:
56
+ for compare in compares:
57
+ for suffix in suffixes:
58
+ prompts = []
59
+ for bin in bins:
60
+ if isinstance(bin, (int, float)): # count is a single number
61
+ count = int(bin)
62
+ if count == 0 or count == 1:
63
+ count = num2word(count) if not numeral else count
64
+ prefix_ = "There is" if prefix == "There are" else prefix
65
+ suffix_ = "person" if suffix == "people" else suffix[:-1]
66
+ prompt = f"{prefix_} {count} {suffix_}"
67
+ else: # count > 1
68
+ count = num2word(count) if not numeral else count
69
+ prompt = f"{prefix} {count} {suffix}"
70
+
71
+ elif bin[1] == float("inf"): # count is (lower_bound, inf)
72
+ count = int(bin[0])
73
+ count = num2word(count) if not numeral else count
74
+ prompt = f"{prefix} {compare} {count} {suffix}"
75
+
76
+ else: # bin is (lower_bound, upper_bound)
77
+ left, right = int(bin[0]), int(bin[1])
78
+ left, right = num2word(left) if not numeral else left, num2word(right) if not numeral else right
79
+ prompt = f"{prefix} between {left} and {right} {suffix}"
80
+
81
+ # Remove starting and trailing whitespaces
82
+ prompt = prompt.strip() + "."
83
+
84
+ prompts.append(prompt)
85
+
86
+ text_prompts.append(prompts)
87
+
88
+ return text_prompts
89
+
90
+
91
+ def encode_text(
92
+ model_name: str,
93
+ weight_name: str,
94
+ text: List[str]
95
+ ) -> Tensor:
96
+ if torch.cuda.is_available():
97
+ device = torch.device("cuda")
98
+ elif torch.backends.mps.is_available():
99
+ device = torch.device("mps")
100
+ else:
101
+ device = torch.device("cpu")
102
+ text = open_clip.get_tokenizer(model_name)(text).to(device)
103
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).to(device)
104
+ model.eval()
105
+ with torch.no_grad():
106
+ text_feats = model.encode_text(text)
107
+ text_feats = F.normalize(text_feats, p=2, dim=-1).detach().cpu()
108
+ return text_feats
109
+
110
+
111
+ def optimize_text_prompts(
112
+ model_name: str,
113
+ weight_name: str,
114
+ flat_bins: List[Union[float, Tuple[float, float]]],
115
+ batch_size: int = 1024,
116
+ ) -> List[str]:
117
+ text_prompts = format_count(flat_bins)
118
+
119
+ # Find the template that has the smallest average similarity of bin prompts.
120
+ print("Finding the best setup for text prompts...")
121
+ text_prompts_ = [prompt for prompts in text_prompts for prompt in prompts] # flatten the list
122
+ text_feats = []
123
+ for i in tqdm(range(0, len(text_prompts_), batch_size)):
124
+ text_feats.append(encode_text(model_name, weight_name, text_prompts_[i: min(i + batch_size, len(text_prompts_))]))
125
+ text_feats = torch.cat(text_feats, dim=0)
126
+
127
+ sims = []
128
+ for idx, prompts in enumerate(text_prompts):
129
+ text_feats_ = text_feats[idx * len(prompts): (idx + 1) * len(prompts)]
130
+ sim = torch.mm(text_feats_, text_feats_.T)
131
+ sim = sim[~torch.eye(sim.shape[0], dtype=bool)].mean().item()
132
+ sims.append(sim)
133
+
134
+ optimal_prompts = text_prompts[np.argmin(sims)]
135
+ sim = sims[np.argmin(sims)]
136
+ print(f"Found the best text prompts: {optimal_prompts} (similarity: {sim:.2f})")
137
+ return optimal_prompts
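
optimize_text_prompts enumerates every (prefix, numeral, comparison, suffix) combination through format_count, encodes each candidate set with CLIP, and keeps the set whose normalised text features have the lowest mean off-diagonal cosine similarity. A sketch of the enumeration step alone, which needs no CLIP download (the bins are illustrative and the import path is an assumption):

from models.clip_ebc.utils import format_count  # assumes the repo root is on PYTHONPATH

flat_bins = [0, 1, (2, 3), (4, float("inf"))]   # one entry per bin: a scalar or a (low, high) tuple
candidates = format_count(flat_bins)

# 12 prefixes x 2 numeral modes x 18 comparisons x 8 suffixes with the lists defined above.
print(len(candidates))   # 3456 candidate template sets
print(candidates[0])     # one prompt per bin, e.g. '0 person.', ..., 'more than 4 people.'
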
models/clip_ebc/vit.py ADDED
@@ -0,0 +1,372 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import math
4
+ from einops import rearrange
5
+ import open_clip
6
+ from peft import get_peft_model, LoraConfig
7
+ from typing import Optional, Tuple
8
+
9
+ from ..utils import interpolate_pos_embed, ViTAdapter
10
+ # from ..utils import TransformerRefine, TransformerDownsample, TransformerUpsample
11
+ from ..utils import ConvRefine, ConvDownsample, ConvUpsample
12
+ from ..utils import _get_norm_layer, _get_activation
13
+
14
+
15
+ vit_names_and_weights = {
16
+ "ViT-B-32": [
17
+ "openai",
18
+ "laion400m_e31", "laion400m_e32", "laion2b_e16", "laion2b_s34b_b79k",
19
+ "datacomp_xl_s13b_b90k", "datacomp_m_s128m_b4k", "datacomp_s_s13m_b4k",
20
+ "commonpool_m_clip_s128m_b4k", "commonpool_m_laion_s128m_b4k", "commonpool_m_image_s128m_b4k", "commonpool_m_text_s128m_b4k", "commonpool_m_basic_s128m_b4k", "commonpool_m_s128m_b4k",
21
+ "commonpool_s_clip_s13m_b4k", "commonpool_s_laion_s13m_b4k", "commonpool_s_image_s13m_b4k", "commonpool_s_text_s13m_b4k", "commonpool_s_basic_s13m_b4k", "commonpool_s_s13m_b4k",
22
+ ],
23
+ "ViT-B-32-256": ["datacomp_s34b_b86k"],
24
+ "ViT-B-16": [
25
+ "openai",
26
+ "laion400m_e31", "laion400m_e32", "laion2b_s34b_b88k",
27
+ "datacomp_xl_s13b_b90k", "datacomp_l_s1b_b8k",
28
+ "commonpool_l_clip_s1b_b8k", "commonpool_l_laion_s1b_b8k", "commonpool_l_image_s1b_b8k", "commonpool_l_text_s1b_b8k", "commonpool_l_basic_s1b_b8k", "commonpool_l_s1b_b8k",
29
+ "dfn2b"
30
+ ],
31
+ "ViT-L-14": [
32
+ "openai",
33
+ "laion400m_e31", "laion400m_e32", "laion2b_s32b_b82k",
34
+ "datacomp_xl_s13b_b90k",
35
+ "commonpool_xl_clip_s13b_b90k", "commonpool_xl_laion_s13b_b90k", "commonpool_xl_s13b_b90k"
36
+ ],
37
+ "ViT-L-14-336": ["openai"],
38
+ "ViT-H-14": ["laion2b_s32b_b79k"],
39
+ "ViT-g-14": ["laion2b_s12b_b42k", "laion2b_s34b_b88k"],
40
+ "ViT-bigG-14": ["laion2b_s39b_b160k"],
41
+ }
42
+
43
+
44
+ refiner_channels = {
45
+ "ViT-B-32": 768,
46
+ "ViT-B-32-256": 768,
47
+ "ViT-B-16": 768,
48
+ "ViT-L-14": 1024,
49
+ "ViT-L-14-336": 1024,
50
+ "ViT-H-14": 1280,
51
+ "ViT-g-14": 1408,
52
+ "ViT-bigG-14": 1664,
53
+ }
54
+
55
+ refiner_groups = {
56
+ "ViT-B-32": 1,
57
+ "ViT-B-32-256": 1,
58
+ "ViT-B-16": 1,
59
+ "ViT-L-14": 1,
60
+ "ViT-L-14-336": 1,
61
+ "ViT-H-14": 1,
62
+ "ViT-g-14": refiner_channels["ViT-g-14"] // 704, # 2
63
+ "ViT-bigG-14": refiner_channels["ViT-bigG-14"] // 416, # 4
64
+ }
65
+
66
+
67
+
68
+ class ViT(nn.Module):
69
+ def __init__(
70
+ self,
71
+ model_name: str,
72
+ weight_name: str,
73
+ block_size: int = 16,
74
+ num_vpt: int = 32,
75
+ vpt_drop: float = 0.0,
76
+ adapter: bool = False,
77
+ adapter_reduction: int = 4,
78
+ input_size: Optional[Tuple[int, int]] = None,
79
+ norm: str = "none",
80
+ act: str = "none"
81
+ ) -> None:
82
+ super(ViT, self).__init__()
83
+ assert model_name in vit_names_and_weights, f"Model name should be one of {list(vit_names_and_weights.keys())}, but got {model_name}."
84
+ assert weight_name in vit_names_and_weights[model_name], f"Pretrained should be one of {vit_names_and_weights[model_name]}, but got {weight_name}."
85
+ if adapter:
86
+ assert num_vpt is None or num_vpt == 0, "num_vpt should be None or 0 when using adapter."
87
+ assert vpt_drop is None or vpt_drop == 0.0, "vpt_drop should be None or 0.0 when using adapter."
88
+ else:
89
+ assert num_vpt > 0, f"Number of VPT tokens should be greater than 0, but got {num_vpt}."
90
+ assert 0.0 <= vpt_drop < 1.0, f"VPT dropout should be in [0.0, 1.0), but got {vpt_drop}."
91
+
92
+ self.model_name, self.weight_name = model_name, weight_name
93
+ self.block_size = block_size
94
+ self.num_vpt = num_vpt
95
+ self.vpt_drop = vpt_drop
96
+ self.adapter = adapter
97
+
98
+ model = open_clip.create_model_from_pretrained(model_name, weight_name, return_transform=False).visual
99
+
100
+ # Always freeze the parameters of the model
101
+ for param in model.parameters():
102
+ param.requires_grad = False
103
+
104
+ # Setup the model
105
+ self.input_size = input_size if input_size is not None else model.image_size
106
+ self.pretrain_size = model.image_size
107
+ self.patch_size = model.patch_size
108
+ self.class_embedding = model.class_embedding
109
+ self.positional_embedding = model.positional_embedding
110
+ self.embed_dim = model.class_embedding.shape[-1]
111
+
112
+ self.conv1 = model.conv1
113
+ self.ln_pre = model.ln_pre
114
+ self.resblocks = model.transformer.resblocks
115
+ self.num_layers = len(self.resblocks)
116
+ self.ln_post = model.ln_post
117
+
118
+ # Setup VPT tokens
119
+ val = math.sqrt(6. / float(3 * self.patch_size[0] + self.embed_dim))
120
+ for idx in range(self.num_layers):
121
+ if self.adapter:
122
+ setattr(self, f"adapter{idx}", ViTAdapter(
123
+ in_channels=self.embed_dim,
124
+ bottleneck_channels=self.embed_dim // adapter_reduction,
125
+ ))
126
+ else:
127
+ setattr(self, f"vpt_{idx}", nn.Parameter(torch.empty(self.num_vpt, self.embed_dim)))
128
+ nn.init.uniform_(getattr(self, f"vpt_{idx}"), -val, val)
129
+ setattr(self, f"vpt_drop_{idx}", nn.Dropout(self.vpt_drop))
130
+
131
+ # Adjust the positional embedding to match the new input size
132
+ self._adjust_pos_embed()
133
+
134
+ in_features, out_features = model.proj.shape
135
+ self.in_features = in_features
136
+ self.out_features = out_features
137
+
138
+ patch_size = self.patch_size[0]
139
+ if patch_size in [16, 32]:
140
+ assert block_size in [8, 16, 32], f"Patch size is {patch_size}, so block size should be one of [8, 16, 32], but got {block_size}."
141
+ else: # patch_size == 14
142
+ assert block_size in [7, 14, 28], f"Patch size is 14, but got block size {block_size}."
143
+
144
+ if norm == "bn":
145
+ norm_layer = nn.BatchNorm2d
146
+ elif norm == "ln":
147
+ norm_layer = nn.LayerNorm
148
+ else:
149
+ norm_layer = _get_norm_layer(model)
150
+
151
+ if act == "relu":
152
+ activation = nn.ReLU(inplace=True)
153
+ elif act == "gelu":
154
+ activation = nn.GELU()
155
+ else:
156
+ activation = _get_activation(model)
157
+
158
+ if block_size == patch_size:
159
+ self.refiner = ConvRefine(
160
+ in_channels=self.in_features,
161
+ out_channels=self.in_features,
162
+ norm_layer=norm_layer,
163
+ activation=activation,
164
+ groups=refiner_groups[self.model_name],
165
+ )
166
+
167
+ elif block_size < patch_size: # upsample
168
+ if block_size == 8 and patch_size == 32:
169
+ self.refiner = nn.Sequential(
170
+ ConvUpsample(
171
+ in_channels=self.in_features,
172
+ out_channels=self.in_features,
173
+ norm_layer=norm_layer,
174
+ activation=activation,
175
+ groups=refiner_groups[self.model_name],
176
+ ),
177
+ ConvUpsample(
178
+ in_channels=self.in_features,
179
+ out_channels=self.in_features,
180
+ norm_layer=norm_layer,
181
+ activation=activation,
182
+ groups=refiner_groups[self.model_name],
183
+ ),
184
+ )
185
+ else:
186
+ self.refiner = ConvUpsample(
187
+ in_channels=self.in_features,
188
+ out_channels=self.in_features,
189
+ norm_layer=norm_layer,
190
+ activation=activation,
191
+ groups=refiner_groups[self.model_name],
192
+ )
193
+
194
+ else: # downsample
195
+ assert block_size // patch_size == 2, f"Block size {block_size} should be 2 times the patch size {patch_size}."
196
+ self.refiner = ConvDownsample(
197
+ in_channels=self.in_features,
198
+ out_channels=self.in_features,
199
+ norm_layer=norm_layer,
200
+ activation=activation,
201
+ groups=refiner_groups[self.model_name],
202
+ )
203
+
204
+ def _adjust_pos_embed(self) -> None:
205
+ """
206
+ Interpolate the pretrained positional embedding to match the spatial resolution
207
+ implied by `self.input_size` and register it as a frozen parameter.
208
+
209
+ The original resolution is taken from `self.pretrain_size`; this method takes no
210
+ arguments and modifies `self.positional_embedding` in place.
211
+ """
212
+ self.positional_embedding = nn.Parameter(self._interpolate_pos_embed(self.pretrain_size[0], self.pretrain_size[1], self.input_size[0], self.input_size[1]), requires_grad=False)
213
+
214
+ def _interpolate_pos_embed(self, orig_h: int, orig_w: int, new_h: int, new_w: int) -> Tensor:
215
+ """
216
+ Interpolate the positional embedding to match the spatial resolution of the feature map.
217
+
218
+ Args:
219
+ orig_h, orig_w: The original spatial resolution of the image.
220
+ new_h, new_w: The new spatial resolution of the image.
221
+ """
222
+ if (orig_h, orig_w) == (new_h, new_w):
223
+ return self.positional_embedding
224
+
225
+ orig_h_patches, orig_w_patches = orig_h // self.patch_size[0], orig_w // self.patch_size[1]
226
+ new_h_patches, new_w_patches = new_h // self.patch_size[0], new_w // self.patch_size[1]
227
+ class_pos_embed, patch_pos_embed = self.positional_embedding[:1, :], self.positional_embedding[1:, :]
228
+ patch_pos_embed = rearrange(patch_pos_embed, "(h w) d -> d h w", h=orig_h_patches, w=orig_w_patches)
229
+ patch_pos_embed = interpolate_pos_embed(patch_pos_embed, size=(new_h_patches, new_w_patches))
230
+ patch_pos_embed = rearrange(patch_pos_embed, "d h w -> (h w) d")
231
+ pos_embed = torch.cat((class_pos_embed, patch_pos_embed), dim=0)
232
+ return pos_embed
233
+
234
+ def train(self, mode: bool = True):
235
+ if mode:
236
+ # training:
237
+ self.conv1.eval()
238
+ self.ln_pre.eval()
239
+ self.resblocks.eval()
240
+ self.ln_post.eval()
241
+
242
+ for idx in range(self.num_layers):
243
+ getattr(self, f"adapter{idx}" if self.adapter else f"vpt_drop_{idx}").train()  # vpt_drop_{idx} does not exist in adapter mode
244
+
245
+ self.refiner.train()
246
+
247
+ else:
248
+ # evaluation:
249
+ for module in self.children():
250
+ module.train(mode)
251
+
252
+ def _prepare_vpt(self, layer: int, batch_size: int, device: torch.device) -> Tensor:
253
+ vpt = getattr(self, f"vpt_{layer}").unsqueeze(0).expand(batch_size, -1, -1).to(device) # (batch_size, num_vpt, embed_dim)
254
+ vpt = getattr(self, f"vpt_drop_{layer}")(vpt)
255
+
256
+ return vpt
257
+
258
+ def _forward_patch_embed(self, x: Tensor) -> Tensor:
259
+ # This step performs 1) embed x into patches; 2) append the class token; 3) add positional embeddings.
260
+ assert len(x.shape) == 4, f"Expected input to have shape (batch_size, 3, height, width), but got {x.shape}"
261
+ batch_size, _, height, width = x.shape
262
+
263
+ # Step 1: Embed x into patches
264
+ x = self.conv1(x)
265
+
266
+ # Step 2: Append the class token
267
+ class_embedding = self.class_embedding.expand(batch_size, 1, -1)
268
+ x = rearrange(x, "b d h w -> b (h w) d")
269
+ x = torch.cat([class_embedding, x], dim=1)
270
+
271
+ # Step 3: Add positional embeddings
272
+ pos_embed = self._interpolate_pos_embed(orig_h=self.input_size[0], orig_w=self.input_size[1], new_h=height, new_w=width).expand(batch_size, -1, -1)
273
+ x = x + pos_embed
274
+
275
+ x = self.ln_pre(x)
276
+ return x
277
+
278
+ def _forward_vpt(self, x: Tensor, idx: int) -> Tensor:
279
+ batch_size = x.shape[0]
280
+ device = x.device
281
+
282
+ # Assemble
283
+ vpt = self._prepare_vpt(idx, batch_size, device)
284
+ x = torch.cat([
285
+ x[:, :1, :], # class token
286
+ vpt,
287
+ x[:, 1:, :] # patches
288
+ ], dim=1)
289
+
290
+ # Forward
291
+ x = self.resblocks[idx](x)
292
+
293
+ # Disassemble
294
+ x = torch.cat([
295
+ x[:, :1, :], # class token
296
+ x[:, 1 + self.num_vpt:, :] # patches
297
+ ], dim=1)
298
+
299
+ return x
300
+
301
+ def _forward_adapter(self, x: Tensor, idx: int) -> Tensor:
302
+ return getattr(self, f"adapter{idx}")(x)
303
+
304
+ def forward_encoder(self, x: Tensor) -> Tensor:
305
+ x = self._forward_patch_embed(x)
306
+ for idx in range(self.num_layers):
307
+ x = self._forward_adapter(x, idx) if self.adapter else self._forward_vpt(x, idx)
308
+ x = self.ln_post(x)
309
+ return x
310
+
311
+ def forward(self, x: Tensor) -> Tensor:
312
+ orig_h, orig_w = x.shape[-2:]
313
+ num_patches_h, num_patches_w = orig_h // self.patch_size[0], orig_w // self.patch_size[1]
314
+ x = self.forward_encoder(x)
315
+ x = x[:, 1:, :] # remove the class token
316
+ x = rearrange(x, "b (h w) d -> b d h w", h=num_patches_h, w=num_patches_w)
317
+
318
+ x = self.refiner(x)
319
+ return x
320
+
321
+
322
+ def _vit(
323
+ model_name: str,
324
+ weight_name: str,
325
+ block_size: int = 16,
326
+ num_vpt: int = 32,
327
+ vpt_drop: float = 0.1,
328
+ adapter: bool = False,
329
+ adapter_reduction: int = 4,
330
+ lora: bool = False,
331
+ lora_rank: int = 16,
332
+ lora_alpha: float = 32.0,
333
+ lora_dropout: float = 0.1,
334
+ input_size: Optional[Tuple[int, int]] = None,
335
+ norm: str = "none",
336
+ act: str = "none"
337
+ ) -> ViT:
338
+ assert not (lora and adapter), "LoRA and adapter cannot be used together."
339
+ model = ViT(
340
+ model_name=model_name,
341
+ weight_name=weight_name,
342
+ block_size=block_size,
343
+ num_vpt=num_vpt,
344
+ vpt_drop=vpt_drop,
345
+ adapter=adapter,
346
+ adapter_reduction=adapter_reduction,
347
+ input_size=input_size,
348
+ norm=norm,
349
+ act=act
350
+ )
351
+
352
+ if lora:
353
+ target_modules = []
354
+ for name, module in model.named_modules():
355
+ if isinstance(module, (nn.Linear, nn.Conv2d, nn.MultiheadAttention)) and "refiner" not in name:
356
+ target_modules.append(name)
357
+
358
+ lora_config = LoraConfig(
359
+ r=lora_rank,
360
+ lora_alpha=lora_alpha,
361
+ lora_dropout=lora_dropout,
362
+ bias="none",
363
+ target_modules=target_modules,
364
+ )
365
+ model = get_peft_model(model, lora_config)
366
+
367
+ # Unfreeze refiner
368
+ for name, module in model.named_modules():
369
+ if "refiner" in name:
370
+ module.requires_grad_(True)
371
+
372
+ return model
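
A standalone sketch of the per-layer VPT bookkeeping implemented in _forward_vpt above (shapes are illustrative for a ViT-B/16 at 224 px): the learned prompt tokens are inserted between the class token and the patch tokens before each block and stripped again afterwards, so the sequence length seen outside the block never changes.

import torch

B, N, D, num_vpt = 2, 197, 768, 32                 # 1 class token + 196 patches
x = torch.randn(B, N, D)
vpt = torch.randn(B, num_vpt, D)

with_prompts = torch.cat([x[:, :1], vpt, x[:, 1:]], dim=1)   # (B, 1 + num_vpt + 196, D)
# ... the transformer block would run here ...
without_prompts = torch.cat([with_prompts[:, :1], with_prompts[:, 1 + num_vpt:]], dim=1)
assert without_prompts.shape == x.shape
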
models/ebc/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .model import EBC, _ebc
2
+
3
+ __all__ = ["EBC", "_ebc"]
models/ebc/cannet.py ADDED
@@ -0,0 +1,105 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import torch.nn.functional as F
4
+ from typing import List, Optional
5
+
6
+ from .csrnet import _csrnet, _csrnet_bn
7
+ from ..utils import _init_weights
8
+
9
+ EPS = 1e-6
10
+
11
+
12
+ class ContextualModule(nn.Module):
13
+ def __init__(
14
+ self,
15
+ in_channels: int,
16
+ out_channels: int = 512,
17
+ scales: List[int] = [1, 2, 3, 6],
18
+ ) -> None:
19
+ super().__init__()
20
+ self.scales = scales
21
+ self.multiscale_modules = nn.ModuleList([self.__make_scale__(in_channels, size) for size in scales])
22
+ self.bottleneck = nn.Conv2d(in_channels * 2, out_channels, kernel_size=1)
23
+ self.relu = nn.ReLU(inplace=True)
24
+ self.weight_net = nn.Conv2d(in_channels, in_channels, kernel_size=1)
25
+ self.apply(_init_weights)
26
+
27
+ def __make_weight__(self, feature: Tensor, scale_feature: Tensor) -> Tensor:
28
+ weight_feature = feature - scale_feature
29
+ weight_feature = self.weight_net(weight_feature)
30
+ return F.sigmoid(weight_feature)
31
+
32
+ def __make_scale__(self, channels: int, size: int) -> nn.Module:
33
+ return nn.Sequential(
34
+ nn.AdaptiveAvgPool2d(output_size=(size, size)),
35
+ nn.Conv2d(channels, channels, kernel_size=1, bias=False),
36
+ )
37
+
38
+ def forward(self, feature: Tensor) -> Tensor:
39
+ h, w = feature.shape[-2:]
40
+ multiscale_feats = [F.interpolate(input=scale(feature), size=(h, w), mode="bilinear") for scale in self.multiscale_modules]
41
+ weights = [self.__make_weight__(feature, scale_feature) for scale_feature in multiscale_feats]
42
+ multiscale_feats = sum([multiscale_feats[i] * weights[i] for i in range(len(weights))]) / (sum(weights) + EPS)
43
+ overall_features = torch.cat([multiscale_feats, feature], dim=1)
44
+ overall_features = self.bottleneck(overall_features)
45
+ overall_features = self.relu(overall_features)
46
+ return overall_features
47
+
48
+
49
+ class CANNet(nn.Module):
50
+ def __init__(
51
+ self,
52
+ model_name: str,
53
+ block_size: Optional[int] = None,
54
+ norm: str = "none",
55
+ act: str = "none",
56
+ scales: List[int] = [1, 2, 3, 6],
57
+ ) -> None:
58
+ super().__init__()
59
+ assert model_name in ["csrnet", "csrnet_bn"], f"Model name should be one of ['csrnet', 'csrnet_bn'], but got {model_name}."
60
+ assert block_size is None or block_size in [8, 16, 32], f"block_size should be one of [8, 16, 32], but got {block_size}."
61
+ assert isinstance(scales, (tuple, list)), f"scales should be a list or tuple, got {type(scales)}."
62
+ assert len(scales) > 0, f"Expected at least one size, got {len(scales)}."
63
+ assert all([isinstance(size, int) for size in scales]), f"Expected all size to be int, got {scales}."
64
+ self.model_name = model_name
65
+ self.scales = scales
66
+
67
+ csrnet = _csrnet(block_size=block_size, norm=norm, act=act) if model_name == "csrnet" else _csrnet_bn(block_size=block_size, norm=norm, act=act)
68
+ self.block_size = csrnet.block_size
69
+
70
+ self.encoder = csrnet.encoder
71
+ self.encoder_channels = csrnet.encoder_channels
72
+ self.encoder_reduction = csrnet.encoder_reduction # feature map size compared to input size
73
+
74
+ self.refiner = nn.Sequential(
75
+ csrnet.refiner,
76
+ ContextualModule(csrnet.refiner_channels, 512, scales)
77
+ )
78
+ self.refiner_channels = 512
79
+ self.refiner_reduction = csrnet.refiner_reduction # feature map size compared to input size
80
+
81
+ self.decoder = csrnet.decoder
82
+ self.decoder_channels = csrnet.decoder_channels
83
+ self.decoder_reduction = csrnet.decoder_reduction
84
+
85
+ def encode(self, x: Tensor) -> Tensor:
86
+ return self.encoder(x)
87
+
88
+ def refine(self, x: Tensor) -> Tensor:
89
+ return self.refiner(x)
90
+
91
+ def decode(self, x: Tensor) -> Tensor:
92
+ return self.decoder(x)
93
+
94
+ def forward(self, x: Tensor) -> Tensor:
95
+ x = self.encode(x)
96
+ x = self.refine(x)
97
+ x = self.decode(x)
98
+ return x
99
+
100
+
101
+ def _cannet(block_size: Optional[int] = None, norm: str = "none", act: str = "none", scales: List[int] = [1, 2, 3, 6]) -> CANNet:
102
+ return CANNet("csrnet", block_size=block_size, norm=norm, act=act, scales=scales)
103
+
104
+ def _cannet_bn(block_size: Optional[int] = None, norm: str = "none", act: str = "none", scales: List[int] = [1, 2, 3, 6]) -> CANNet:
105
+ return CANNet("csrnet_bn", block_size=block_size, norm=norm, act=act, scales=scales)
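
A quick shape-check sketch for the CAN backbone (not part of the commit; the import path is an assumption and the first run downloads the VGG-16 weights). With block_size=16 the refiner downsamples the stride-8 VGG features once, the contextual module fuses the four pooled scales, and the dilated decoder keeps that resolution:

import torch
from models.ebc.cannet import _cannet  # assumes the repo root is on PYTHONPATH

backbone = _cannet(block_size=16).eval()
with torch.no_grad():
    feats = backbone(torch.rand(1, 3, 224, 224))
print(feats.shape)  # expected roughly (1, 128, 14, 14): 128 decoder channels at stride 16
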
models/ebc/csrnet.py ADDED
@@ -0,0 +1,104 @@
1
+ from torch import nn, Tensor
2
+ from torch.hub import load_state_dict_from_url
3
+ from typing import Optional
4
+
5
+ from .vgg import VGG
6
+ from .utils import make_vgg_layers, vgg_urls
7
+ from ..utils import _init_weights, ConvDownsample, _get_activation, _get_norm_layer
8
+
9
+ EPS = 1e-6
10
+
11
+
12
+ encoder_cfg = [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512]
13
+ decoder_cfg = [512, 512, 512, 256, 128]
14
+
15
+
16
+ class CSRNet(nn.Module):
17
+ def __init__(
18
+ self,
19
+ model_name: str,
20
+ block_size: Optional[int] = None,
21
+ norm: str = "none",
22
+ act: str = "none"
23
+ ) -> None:
24
+ super().__init__()
25
+ assert model_name in ["vgg16", "vgg16_bn"], f"Model name should be one of ['vgg16', 'vgg16_bn'], but got {model_name}."
26
+ assert block_size is None or block_size in [8, 16, 32], f"block_size should be one of [8, 16, 32], but got {block_size}."
27
+ self.model_name = model_name
28
+
29
+ vgg = VGG(make_vgg_layers(encoder_cfg, in_channels=3, batch_norm="bn" in model_name, dilation=1))
30
+ vgg.load_state_dict(load_state_dict_from_url(vgg_urls[model_name]), strict=False)
31
+ self.encoder = vgg.features
32
+ self.encoder_reduction = 8
33
+ self.encoder_channels = 512
34
+ self.block_size = block_size if block_size is not None else 8
35
+
36
+ if norm == "bn":
37
+ norm_layer = nn.BatchNorm2d
38
+ elif norm == "ln":
39
+ norm_layer = nn.LayerNorm
40
+ else:
41
+ norm_layer = _get_norm_layer(vgg)
42
+
43
+ if act == "relu":
44
+ activation = nn.ReLU(inplace=True)
45
+ elif act == "gelu":
46
+ activation = nn.GELU()
47
+ else:
48
+ activation = _get_activation(vgg)
49
+
50
+ if self.block_size == self.encoder_reduction:
51
+ self.refiner = nn.Identity()
52
+ elif self.block_size > self.encoder_reduction:
53
+ if self.block_size == 32:
54
+ self.refiner = nn.Sequential(
55
+ ConvDownsample(
56
+ in_channels=self.encoder_channels,
57
+ out_channels=self.encoder_channels,
58
+ norm_layer=norm_layer,
59
+ activation=activation,
60
+ ),
61
+ ConvDownsample(
62
+ in_channels=self.encoder_channels,
63
+ out_channels=self.encoder_channels,
64
+ norm_layer=norm_layer,
65
+ activation=activation,
66
+ )
67
+ )
68
+ elif self.block_size == 16:
69
+ self.refiner = ConvDownsample(
70
+ in_channels=self.encoder_channels,
71
+ out_channels=self.encoder_channels,
72
+ norm_layer=norm_layer,
73
+ activation=activation,
74
+ )
75
+ self.refiner_channels = self.encoder_channels
76
+ self.refiner_reduction = self.block_size
77
+
78
+ decoder = make_vgg_layers(decoder_cfg, in_channels=512, batch_norm="bn" in model_name, dilation=2)
79
+ decoder.apply(_init_weights)
80
+ self.decoder = decoder
81
+ self.decoder_channels = decoder_cfg[-1]
82
+ self.decoder_reduction = self.refiner_reduction
83
+
84
+ def encode(self, x: Tensor) -> Tensor:
85
+ return self.encoder(x)
86
+
87
+ def refine(self, x: Tensor) -> Tensor:
88
+ return self.refiner(x)
89
+
90
+ def decode(self, x: Tensor) -> Tensor:
91
+ return self.decoder(x)
92
+
93
+ def forward(self, x: Tensor) -> Tensor:
94
+ x = self.encode(x)
95
+ x = self.refine(x)
96
+ x = self.decode(x)
97
+ return x
98
+
99
+
100
+ def _csrnet(block_size: Optional[int] = None, norm: str = "none", act: str = "none") -> CSRNet:
101
+ return CSRNet("vgg16", block_size=block_size, norm=norm, act=act)
102
+
103
+ def _csrnet_bn(block_size: Optional[int] = None, norm: str = "none", act: str = "none") -> CSRNet:
104
+ return CSRNet("vgg16_bn", block_size=block_size, norm=norm, act=act)
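
The block_size argument only controls how much extra downsampling the refiner adds on top of the stride-8 VGG encoder: none for 8, one ConvDownsample for 16, two for 32. A small sketch of the resulting output strides (import path assumed; weights are downloaded on first use):

import torch
from models.ebc.csrnet import _csrnet_bn  # assumes the repo root is on PYTHONPATH

x = torch.rand(1, 3, 256, 256)
for bs in (8, 16, 32):
    m = _csrnet_bn(block_size=bs).eval()
    with torch.no_grad():
        y = m(x)
    print(bs, tuple(y.shape))  # spatial size should come out as 256 // bs on each side
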
models/ebc/hrnet.py ADDED
@@ -0,0 +1,195 @@
1
+ import timm
2
+ import torch.nn.functional as F
3
+ from torch import nn, Tensor
4
+ from functools import partial
5
+ from typing import Optional
6
+
7
+ from ..utils import ConvRefine, _get_norm_layer, _get_activation
8
+
9
+
10
+ available_hrnets = [
11
+ "hrnet_w18", "hrnet_w18_small", "hrnet_w18_small_v2",
12
+ "hrnet_w30", "hrnet_w32", "hrnet_w40", "hrnet_w44", "hrnet_w48", "hrnet_w64",
13
+ ]
14
+
15
+
16
+ class HRNet(nn.Module):
17
+ def __init__(
18
+ self,
19
+ model_name: str,
20
+ block_size: Optional[int] = None,
21
+ norm: str = "none",
22
+ act: str = "none"
23
+ ) -> None:
24
+ super().__init__()
25
+ assert model_name in available_hrnets, f"Model name should be one of {available_hrnets}"
26
+ assert block_size is None or block_size in [8, 16, 32], f"block_size should be one of [8, 16, 32], but got {block_size}."
27
+ self.model_name = model_name
28
+ self.block_size = block_size if block_size is not None else 32
29
+
30
+ model = timm.create_model(model_name, pretrained=True)
31
+
32
+ self.conv1 = model.conv1
33
+ self.bn1 = model.bn1
34
+ self.act1 = model.act1
35
+ self.conv2 = model.conv2
36
+ self.bn2 = model.bn2
37
+ self.act2 = model.act2
38
+
39
+ self.layer1 = model.layer1
40
+
41
+ self.transition1 = model.transition1
42
+ self.stage2 = model.stage2
43
+
44
+ self.transition2 = model.transition2
45
+ self.stage3 = model.stage3
46
+
47
+ self.transition3 = model.transition3
48
+ self.stage4 = model.stage4
49
+
50
+ incre_modules = model.incre_modules
51
+ downsamp_modules = model.downsamp_modules
52
+
53
+ assert len(incre_modules) == 4, f"Expected 4 incre_modules, got {len(incre_modules)}"
54
+ assert len(downsamp_modules) == 3, f"Expected 3 downsamp_modules, got {len(downsamp_modules)}"
55
+
56
+ self.out_channels_4 = incre_modules[0][0].downsample[0].out_channels
57
+ self.out_channels_8 = incre_modules[1][0].downsample[0].out_channels
58
+ self.out_channels_16 = incre_modules[2][0].downsample[0].out_channels
59
+ self.out_channels_32 = incre_modules[3][0].downsample[0].out_channels
60
+
61
+ if self.block_size == 8:
62
+ self.encoder_reduction = 8
63
+ self.encoder_channels = self.out_channels_8
64
+ self.incre_modules = incre_modules[:2]
65
+ self.downsamp_modules = downsamp_modules[:1]
66
+
67
+ self.refiner = nn.Identity()
68
+ self.refiner_reduction = 8
69
+ self.refiner_channels = self.out_channels_8
70
+
71
+ elif self.block_size == 16:
72
+ self.encoder_reduction = 16
73
+ self.encoder_channels = self.out_channels_16
74
+ self.incre_modules = incre_modules[:3]
75
+ self.downsamp_modules = downsamp_modules[:2]
76
+
77
+ self.refiner = nn.Identity()
78
+ self.refiner_reduction = 16
79
+ self.refiner_channels = self.out_channels_16
80
+
81
+ else: # self.block_size == 32
82
+ self.encoder_reduction = 32
83
+ self.encoder_channels = self.out_channels_32
84
+ self.incre_modules = incre_modules
85
+ self.downsamp_modules = downsamp_modules
86
+
87
+ self.refiner = nn.Identity()
88
+ self.refiner_reduction = 32
89
+ self.refiner_channels = self.out_channels_32
90
+
91
+ # define the decoder
92
+ if self.refiner_channels <= 512:
93
+ groups = 1
94
+ elif self.refiner_channels <= 1024:
95
+ groups = 2
96
+ elif self.refiner_channels <= 2048:
97
+ groups = 4
98
+ else:
99
+ groups = 8
100
+
101
+ if norm == "bn":
102
+ norm_layer = nn.BatchNorm2d
103
+ elif norm == "ln":
104
+ norm_layer = nn.LayerNorm
105
+ else:
106
+ norm_layer = _get_norm_layer(model)
107
+
108
+ if act == "relu":
109
+ activation = nn.ReLU(inplace=True)
110
+ elif act == "gelu":
111
+ activation = nn.GELU()
112
+ else:
113
+ activation = _get_activation(model)
114
+
115
+ decoder_block = partial(ConvRefine, groups=groups, norm_layer=norm_layer, activation=activation)
116
+ if self.refiner_channels <= 256:
117
+ self.decoder = nn.Identity()
118
+ self.decoder_channels = self.refiner_channels
119
+ elif self.refiner_channels <= 512:
120
+ self.decoder = decoder_block(self.refiner_channels, self.refiner_channels // 2)
121
+ self.decoder_channels = self.refiner_channels // 2
122
+ elif self.refiner_channels <= 1024:
123
+ self.decoder = nn.Sequential(
124
+ decoder_block(self.refiner_channels, self.refiner_channels // 2),
125
+ decoder_block(self.refiner_channels // 2, self.refiner_channels // 4),
126
+ )
127
+ self.decoder_channels = self.refiner_channels // 4
128
+ else:
129
+ self.decoder = nn.Sequential(
130
+ decoder_block(self.refiner_channels, self.refiner_channels // 2),
131
+ decoder_block(self.refiner_channels // 2, self.refiner_channels // 4),
132
+ decoder_block(self.refiner_channels // 4, self.refiner_channels // 8),
133
+ )
134
+ self.decoder_channels = self.refiner_channels // 8
135
+
136
+ self.decoder_reduction = self.refiner_reduction
137
+
138
+ def _interpolate(self, x: Tensor) -> Tensor:
139
+ # This method adjusts the spatial dimensions of the input tensor so that they are divisible by 32.
140
+ if x.shape[-1] % 32 != 0 or x.shape[-2] % 32 != 0:
141
+ new_h = int(round(x.shape[-2] / 32) * 32)
142
+ new_w = int(round(x.shape[-1] / 32) * 32)
143
+ return F.interpolate(x, size=(new_h, new_w), mode="bicubic", align_corners=False)
144
+
145
+ return x
146
+
147
+
148
+ def encode(self, x: Tensor) -> Tensor:
149
+ x = self.conv1(x)
150
+ x = self.bn1(x)
151
+ x = self.act1(x)
152
+
153
+ x = self.conv2(x)
154
+ x = self.bn2(x)
155
+ x = self.act2(x)
156
+
157
+ x = self.layer1(x)
158
+
159
+ x = [t(x) for t in self.transition1]
160
+ x = self.stage2(x)
161
+
162
+ x = [t(x[-1]) if not isinstance(t, nn.Identity) else x[i] for i, t in enumerate(self.transition2)]
163
+ x = self.stage3(x)
164
+
165
+ x = [t(x[-1]) if not isinstance(t, nn.Identity) else x[i] for i, t in enumerate(self.transition3)]
166
+ x = self.stage4(x)
167
+
168
+ assert len(x) == 4, f"Expected 4 outputs, got {len(x)}"
169
+
170
+ feats = None
171
+ for i, incre in enumerate(self.incre_modules):
172
+ if feats is None:
173
+ feats = incre(x[i])
174
+ else:
175
+ down = self.downsamp_modules[i - 1] # needed for torchscript module indexing
176
+ feats = incre(x[i]) + down.forward(feats)
177
+
178
+ return feats
179
+
180
+ def refine(self, x: Tensor) -> Tensor:
181
+ return self.refiner(x)
182
+
183
+ def decode(self, x: Tensor) -> Tensor:
184
+ return self.decoder(x)
185
+
186
+ def forward(self, x: Tensor) -> Tensor:
187
+ x = self._interpolate(x)
188
+ x = self.encode(x)
189
+ x = self.refine(x)
190
+ x = self.decode(x)
191
+ return x
192
+
193
+
194
+ def _hrnet(model_name: str, block_size: Optional[int] = None, norm: str = "none", act: str = "none") -> HRNet:
195
+ return HRNet(model_name, block_size, norm, act)
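
_interpolate above resizes any input whose sides are not multiples of 32 to the nearest multiple before encoding, since the HRNet stages and their cross-resolution fusion assume a stride-32-compatible input. A standalone illustration of that rounding rule:

def round_to_multiple_of_32(h: int, w: int) -> tuple:
    # mirrors the arithmetic in HRNet._interpolate
    return int(round(h / 32) * 32), int(round(w / 32) * 32)

print(round_to_multiple_of_32(250, 333))  # (256, 320): bicubic resize target before encoding
print(round_to_multiple_of_32(224, 224))  # (224, 224): already divisible, left untouched
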
models/ebc/model.py ADDED
@@ -0,0 +1,199 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ from einops import rearrange
4
+
5
+ from typing import Tuple, Union, Dict, Optional, List
6
+ from functools import partial
7
+
8
+ from .cannet import _cannet, _cannet_bn
9
+ from .csrnet import _csrnet, _csrnet_bn
10
+ from .vgg import _vgg_encoder_decoder, _vgg_encoder
11
+ from .vit import _vit, supported_vit_backbones
12
+ from .timm_models import _timm_model
13
+ from .timm_models import regular_models as timm_regular_models, heavy_models as timm_heavy_models, light_models as timm_light_models, lighter_models as timm_lighter_models
14
+ from .hrnet import _hrnet, available_hrnets
15
+
16
+ from ..utils import conv1x1
17
+
18
+
19
+ regular_models = [
20
+ "csrnet", "csrnet_bn",
21
+ "cannet", "cannet_bn",
22
+ "vgg11", "vgg11_bn", "vgg13", "vgg13_bn", "vgg16", "vgg16_bn", "vgg19", "vgg19_bn",
23
+ "vgg11_ae", "vgg11_bn_ae", "vgg13_ae", "vgg13_bn_ae", "vgg16_ae", "vgg16_bn_ae", "vgg19_ae", "vgg19_bn_ae",
24
+ *timm_regular_models,
25
+ *available_hrnets,
26
+ ]
27
+
28
+ heavy_models = timm_heavy_models
29
+
30
+ light_models = timm_light_models
31
+
32
+ lighter_models = timm_lighter_models
33
+
34
+ transformer_models = supported_vit_backbones
35
+
36
+ supported_models = regular_models + heavy_models + light_models + lighter_models + transformer_models
37
+
38
+
39
+
40
+ class EBC(nn.Module):
41
+ def __init__(
42
+ self,
43
+ model_name: str,
44
+ block_size: int,
45
+ bins: List[Tuple[float, float]],
46
+ bin_centers: List[float],
47
+ zero_inflated: bool = True,
48
+ num_vpt: Optional[int] = None,
49
+ vpt_drop: Optional[float] = None,
50
+ input_size: Optional[int] = None,
51
+ norm: str = "none",
52
+ act: str = "none"
53
+ ) -> None:
54
+ super().__init__()
55
+ assert model_name in supported_models, f"Model name should be one of {supported_models}, but got {model_name}."
56
+ self.model_name = model_name
57
+
58
+ if input_size is not None:
59
+ input_size = (input_size, input_size) if isinstance(input_size, int) else input_size
60
+ assert len(input_size) == 2 and input_size[0] > 0 and input_size[1] > 0, f"Expected input_size to be a tuple of two positive integers, got {input_size}"
61
+ self.input_size = input_size
62
+
63
+ assert len(bins) == len(bin_centers), f"Expected bins and bin_centers to have the same length, got {len(bins)} and {len(bin_centers)}"
64
+ assert len(bins) >= 2, f"Expected at least 2 bins, got {len(bins)}"
65
+ assert all(len(b) == 2 for b in bins), f"Expected bins to be a list of tuples of length 2, got {bins}"
66
+ bins = [(float(b[0]), float(b[1])) for b in bins]
67
+ assert all(bin[0] <= p <= bin[1] for bin, p in zip(bins, bin_centers)), f"Expected bin_centers to be within the range of the corresponding bin, got {bins} and {bin_centers}"
68
+
69
+ self.block_size = block_size
70
+ self.bins = bins
71
+ self.register_buffer("bin_centers", torch.tensor(bin_centers, dtype=torch.float32, requires_grad=False).view(1, -1, 1, 1))
72
+
73
+ self.zero_inflated = zero_inflated
74
+ self.num_vpt = num_vpt
75
+ self.vpt_drop = vpt_drop
76
+ self.input_size = input_size
77
+
78
+ self.norm = norm
79
+ self.act = act
80
+
81
+ self._build_backbone()
82
+ self._build_head()
83
+
84
+ def _build_backbone(self) -> None:
85
+ model_name = self.model_name
86
+ if model_name == "csrnet":
87
+ self.backbone = _csrnet(self.block_size, self.norm, self.act)
88
+ elif model_name == "csrnet_bn":
89
+ self.backbone = _csrnet_bn(self.block_size, self.norm, self.act)
90
+ elif model_name == "cannet":
91
+ self.backbone = _cannet(self.block_size, self.norm, self.act)
92
+ elif model_name == "cannet_bn":
93
+ self.backbone = _cannet_bn(self.block_size, self.norm, self.act)
94
+ elif model_name == "vgg11":
95
+ self.backbone = _vgg_encoder("vgg11", self.block_size, self.norm, self.act)
96
+ elif model_name == "vgg11_ae":
97
+ self.backbone = _vgg_encoder_decoder("vgg11", self.block_size, self.norm, self.act)
98
+ elif model_name == "vgg11_bn":
99
+ self.backbone = _vgg_encoder("vgg11_bn", self.block_size, self.norm, self.act)
100
+ elif model_name == "vgg11_bn_ae":
101
+ self.backbone = _vgg_encoder_decoder("vgg11_bn", self.block_size, self.norm, self.act)
102
+ elif model_name == "vgg13":
103
+ self.backbone = _vgg_encoder("vgg13", self.block_size, self.norm, self.act)
104
+ elif model_name == "vgg13_ae":
105
+ self.backbone = _vgg_encoder_decoder("vgg13", self.block_size, self.norm, self.act)
106
+ elif model_name == "vgg13_bn":
107
+ self.backbone = _vgg_encoder("vgg13_bn", self.block_size, self.norm, self.act)
108
+ elif model_name == "vgg13_bn_ae":
109
+ self.backbone = _vgg_encoder_decoder("vgg13_bn", self.block_size, self.norm, self.act)
110
+ elif model_name == "vgg16":
111
+ self.backbone = _vgg_encoder("vgg16", self.block_size, self.norm, self.act)
112
+ elif model_name == "vgg16_ae":
113
+ self.backbone = _vgg_encoder_decoder("vgg16", self.block_size, self.norm, self.act)
114
+ elif model_name == "vgg16_bn":
115
+ self.backbone = _vgg_encoder("vgg16_bn", self.block_size, self.norm, self.act)
116
+ elif model_name == "vgg16_bn_ae":
117
+ self.backbone = _vgg_encoder_decoder("vgg16_bn", self.block_size, self.norm, self.act)
118
+ elif model_name == "vgg19":
119
+ self.backbone = _vgg_encoder("vgg19", self.block_size, self.norm, self.act)
120
+ elif model_name == "vgg19_ae":
121
+ self.backbone = _vgg_encoder_decoder("vgg19", self.block_size, self.norm, self.act)
122
+ elif model_name == "vgg19_bn":
123
+ self.backbone = _vgg_encoder("vgg19_bn", self.block_size, self.norm, self.act)
124
+ elif model_name == "vgg19_bn_ae":
125
+ self.backbone = _vgg_encoder_decoder("vgg19_bn", self.block_size, self.norm, self.act)
126
+ elif model_name in supported_vit_backbones:
127
+ self.backbone = _vit(model_name, block_size=self.block_size, num_vpt=self.num_vpt, vpt_drop=self.vpt_drop, input_size=self.input_size, norm=self.norm, act=self.act)
128
+ elif model_name in available_hrnets:
129
+ self.backbone = _hrnet(model_name, block_size=self.block_size, norm=self.norm, act=self.act)
130
+ else:
131
+ self.backbone = _timm_model(model_name, self.block_size, self.norm, self.act)
132
+
133
+ def _build_head(self) -> None:
134
+ channels = self.backbone.decoder_channels
135
+ if self.zero_inflated:
136
+ self.bin_head = conv1x1(
137
+ in_channels=channels,
138
+ out_channels=len(self.bins) - 1,
139
+ )
140
+ self.pi_head = conv1x1(
141
+ in_channels=channels,
142
+ out_channels=2,
143
+ ) # predicts P(structural zero) vs. P(non-zero) for each block
144
+ else:
145
+ self.bin_head = conv1x1(
146
+ in_channels=channels,
147
+ out_channels=len(self.bins),
148
+ )
149
+
150
+ def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, Tensor]]:
151
+ x = self.backbone(x)
152
+
153
+ if self.zero_inflated:
154
+ logit_pi_maps = self.pi_head(x) # shape: (B, 2, H, W)
155
+ logit_maps = self.bin_head(x) # shape: (B, C, H, W)
156
+ lambda_maps = (logit_maps.softmax(dim=1) * self.bin_centers[:, 1:]).sum(dim=1, keepdim=True) # shape: (B, 1, H, W)
157
+
158
+ # logit_pi_maps.softmax(dim=1)[:, 0] is the probability of zeros
159
+ den_maps = logit_pi_maps.softmax(dim=1)[:, 1:] * lambda_maps # expectation of the Poisson distribution
160
+
161
+ if self.training:
162
+ return logit_pi_maps, logit_maps, lambda_maps, den_maps
163
+ else:
164
+ return den_maps
165
+
166
+ else:
167
+ logit_maps = self.bin_head(x)
168
+ den_maps = (logit_maps.softmax(dim=1) * self.bin_centers).sum(dim=1, keepdim=True)
169
+
170
+ if self.training:
171
+ return logit_maps, den_maps
172
+ else:
173
+ return den_maps
174
+
175
+
176
+ def _ebc(
177
+ model_name: str,
178
+ block_size: int,
179
+ bins: List[Tuple[float, float]],
180
+ bin_centers: List[float],
181
+ zero_inflated: bool = True,
182
+ num_vpt: Optional[int] = None,
183
+ vpt_drop: Optional[float] = None,
184
+ input_size: Optional[int] = None,
185
+ norm: str = "none",
186
+ act: str = "none"
187
+ ) -> EBC:
188
+ return EBC(
189
+ model_name=model_name,
190
+ block_size=block_size,
191
+ bins=bins,
192
+ bin_centers=bin_centers,
193
+ zero_inflated=zero_inflated,
194
+ num_vpt=num_vpt,
195
+ vpt_drop=vpt_drop,
196
+ input_size=input_size,
197
+ norm=norm,
198
+ act=act
199
+ )
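
A small numeric sketch of the zero-inflated expectation computed in EBC.forward above (all values are illustrative): the density of a block is the probability that the block is non-empty times the expected count over the non-zero bins.

import torch

bin_centers = torch.tensor([0.0, 1.0, 2.5, 5.0]).view(1, -1, 1, 1)
logit_pi = torch.tensor([2.0, 0.0]).view(1, 2, 1, 1)   # [zero, non-zero] logits for one block
logit_bins = torch.zeros(1, 3, 1, 1)                   # uniform over the three non-zero bins

lam = (logit_bins.softmax(dim=1) * bin_centers[:, 1:]).sum(dim=1, keepdim=True)
p_nonzero = logit_pi.softmax(dim=1)[:, 1:]
den = p_nonzero * lam
print(lam.item(), p_nonzero.item(), den.item())        # ~2.833, ~0.119, ~0.338
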
models/ebc/timm_models.py ADDED
@@ -0,0 +1,318 @@
1
+ from timm import create_model
2
+ from torch import nn, Tensor
3
+ from typing import Optional
4
+ from functools import partial
5
+
6
+ from ..utils import _get_activation, _get_norm_layer, ConvUpsample, ConvDownsample
7
+ from ..utils import LightConvUpsample, LightConvDownsample, LighterConvUpsample, LighterConvDownsample
8
+ from ..utils import ConvRefine, LightConvRefine, LighterConvRefine
9
+
10
+ regular_models = [
11
+ "resnet18", "resnet34", "resnet50", "resnet101", "resnet152",
12
+ "convnext_nano", "convnext_tiny", "convnext_small", "convnext_base",
13
+ "mobilenetv4_conv_large",
14
+ ]
15
+
16
+ heavy_models = [
17
+ "convnext_large", "convnext_xlarge", "convnext_xxlarge",
18
+ ]
19
+
20
+ light_models = [
21
+ "mobilenetv1_100", "mobilenetv1_125",
22
+ "mobilenetv2_100", "mobilenetv2_140",
23
+ "mobilenetv3_large_100",
24
+ "mobilenetv4_conv_medium",
25
+
26
+ ]
27
+
28
+ lighter_models = [
29
+ "mobilenetv2_050",
30
+ "mobilenetv3_small_050", "mobilenetv3_small_075", "mobilenetv3_small_100",
31
+ "mobilenetv4_conv_small_050", "mobilenetv4_conv_small"
32
+ ]
33
+
34
+ supported_models = regular_models + heavy_models + light_models + lighter_models
35
+
36
+
37
+ refiner_in_channels = {
38
+ # ResNet
39
+ "resnet18": 512,
40
+ "resnet34": 512,
41
+ "resnet50": 2048,
42
+ "resnet101": 2048,
43
+ "resnet152": 2048,
44
+ # ConvNeXt
45
+ "convnext_nano": 640,
46
+ "convnext_tiny": 768,
47
+ "convnext_small": 768,
48
+ "convnext_base": 1024,
49
+ "convnext_large": 1536,
50
+ "convnext_xlarge": 2048,
51
+ "convnext_xxlarge": 3072,
52
+ # MobileNet V1
53
+ "mobilenetv1_100": 1024,
54
+ "mobilenetv1_125": 1280,
55
+ # MobileNet V2
56
+ "mobilenetv2_050": 160,
57
+ "mobilenetv2_100": 320,
58
+ "mobilenetv2_140": 448,
59
+ # MobileNet V3
60
+ "mobilenetv3_small_050": 288,
61
+ "mobilenetv3_small_075": 432,
62
+ "mobilenetv3_small_100": 576,
63
+ "mobilenetv3_large_100": 960,
64
+ # MobileNet V4
65
+ "mobilenetv4_conv_small_050": 480,
66
+ "mobilenetv4_conv_small": 960,
67
+ "mobilenetv4_conv_medium": 960,
68
+ "mobilenetv4_conv_large": 960,
69
+ }
70
+
71
+
72
+ refiner_out_channels = {
73
+ # ResNet
74
+ "resnet18": 512,
75
+ "resnet34": 512,
76
+ "resnet50": 2048,
77
+ "resnet101": 2048,
78
+ "resnet152": 2048,
79
+ # ConvNeXt
80
+ "convnext_nano": 640,
81
+ "convnext_tiny": 768,
82
+ "convnext_small": 768,
83
+ "convnext_base": 1024,
84
+ "convnext_large": 1536,
85
+ "convnext_xlarge": 2048,
86
+ "convnext_xxlarge": 3072,
87
+ # MobileNet V1
88
+ "mobilenetv1_100": 512,
89
+ "mobilenetv1_125": 640,
90
+ # MobileNet V2
91
+ "mobilenetv2_050": 160,
92
+ "mobilenetv2_100": 320,
93
+ "mobilenetv2_140": 448,
94
+ # MobileNet V3
95
+ "mobilenetv3_small_050": 288,
96
+ "mobilenetv3_small_075": 432,
97
+ "mobilenetv3_small_100": 576,
98
+ "mobilenetv3_large_100": 480,
99
+ # MobileNet V4
100
+ "mobilenetv4_conv_small_050": 480,
101
+ "mobilenetv4_conv_small": 960,
102
+ "mobilenetv4_conv_medium": 960,
103
+ "mobilenetv4_conv_large": 960,
104
+ }
105
+
106
+
107
+ groups = {
108
+ # ResNet
109
+ "resnet18": 1,
110
+ "resnet34": 1,
111
+ "resnet50": refiner_in_channels["resnet50"] // 512,
112
+ "resnet101": refiner_in_channels["resnet101"] // 512,
113
+ "resnet152": refiner_in_channels["resnet152"] // 512,
114
+ # ConvNeXt
115
+ "convnext_nano": 8,
116
+ "convnext_tiny": 8,
117
+ "convnext_small": 8,
118
+ "convnext_base": 8,
119
+ "convnext_large": refiner_in_channels["convnext_large"] // 512,
120
+ "convnext_xlarge": refiner_in_channels["convnext_xlarge"] // 512,
121
+ "convnext_xxlarge": refiner_in_channels["convnext_xxlarge"] // 512,
122
+ # MobileNet V1
123
+ "mobilenetv1_100": None,
124
+ "mobilenetv1_125": None,
125
+ # MobileNet V2
126
+ "mobilenetv2_050": None,
127
+ "mobilenetv2_100": None,
128
+ "mobilenetv2_140": None,
129
+ # MobileNet V3
130
+ "mobilenetv3_small_050": None,
131
+ "mobilenetv3_small_075": None,
132
+ "mobilenetv3_small_100": None,
133
+ "mobilenetv3_large_100": None,
134
+ # MobileNet V4
135
+ "mobilenetv4_conv_small_050": None,
136
+ "mobilenetv4_conv_small": None,
137
+ "mobilenetv4_conv_medium": None,
138
+ "mobilenetv4_conv_large": 1,
139
+ }
140
+
141
+
142
+ class TIMMModel(nn.Module):
143
+ def __init__(
144
+ self,
145
+ model_name: str,
146
+ block_size: Optional[int] = None,
147
+ norm: str = "none",
148
+ act: str = "none"
149
+ ) -> None:
150
+ super().__init__()
151
+ assert model_name in supported_models, f"Backbone {model_name} not supported. Supported models are {supported_models}"
152
+ assert block_size is None or block_size in [8, 16, 32], f"Block size should be one of [8, 16, 32], but got {block_size}."
153
+ self.model_name = model_name
154
+ self.encoder = create_model(model_name, pretrained=True, features_only=True, out_indices=[-1])
155
+ self.encoder_channels = self.encoder.feature_info.channels()[-1]
156
+ self.encoder_reduction = self.encoder.feature_info.reduction()[-1]
157
+ self.block_size = block_size if block_size is not None else self.encoder_reduction
158
+
159
+ if model_name in lighter_models:
160
+ upsample_block = LighterConvUpsample
161
+ downsample_block = LighterConvDownsample
162
+ decoder_block = LighterConvRefine
163
+ elif model_name in light_models:
164
+ upsample_block = LightConvUpsample
165
+ downsample_block = LightConvDownsample
166
+ decoder_block = LightConvRefine
167
+ else:
168
+ upsample_block = partial(ConvUpsample, groups=groups[model_name])
169
+ downsample_block = partial(ConvDownsample, groups=groups[model_name])
170
+ decoder_block = partial(ConvRefine, groups=groups[model_name])
171
+
172
+
173
+ if norm == "bn":
174
+ norm_layer = nn.BatchNorm2d
175
+ elif norm == "ln":
176
+ norm_layer = nn.LayerNorm
177
+ else:
178
+ norm_layer = _get_norm_layer(self.encoder)
179
+
180
+ if act == "relu":
181
+ activation = nn.ReLU(inplace=True)
182
+ elif act == "gelu":
183
+ activation = nn.GELU()
184
+ else:
185
+ activation = _get_activation(self.encoder)
186
+
187
+ if self.block_size > self.encoder_reduction:
188
+ if self.block_size > self.encoder_reduction * 2:
189
+ assert self.block_size == self.encoder_reduction * 4, f"Block size {self.block_size} is not supported for model {self.model_name}. Supported block sizes are {self.encoder_reduction}, {self.encoder_reduction * 2}, and {self.encoder_reduction * 4}."
190
+ self.refiner = nn.Sequential(
191
+ downsample_block(
192
+ in_channels=self.encoder_channels,
193
+ out_channels=refiner_in_channels[self.model_name],
194
+ norm_layer=norm_layer,
195
+ activation=activation,
196
+ ),
197
+ downsample_block(
198
+ in_channels=refiner_in_channels[self.model_name],
199
+ out_channels=refiner_out_channels[self.model_name],
200
+ norm_layer=norm_layer,
201
+ activation=activation,
202
+ )
203
+ )
204
+ else:
205
+ assert self.block_size == self.encoder_reduction * 2, f"Block size {self.block_size} is not supported for model {self.model_name}. Supported block sizes are {self.encoder_reduction}, {self.encoder_reduction * 2}, and {self.encoder_reduction * 4}."
206
+ self.refiner = downsample_block(
207
+ in_channels=self.encoder_channels,
208
+ out_channels=refiner_out_channels[self.model_name],
209
+ norm_layer=norm_layer,
210
+ activation=activation,
211
+ )
212
+
213
+ self.refiner_channels = refiner_out_channels[self.model_name]
214
+
215
+ elif self.block_size < self.encoder_reduction:
216
+ if self.block_size < self.encoder_reduction // 2:
217
+ assert self.block_size == self.encoder_reduction // 4, f"Block size {self.block_size} is not supported for model {self.model_name}. Supported block sizes are {self.encoder_reduction}, {self.encoder_reduction // 2}, and {self.encoder_reduction // 4}."
218
+ self.refiner = nn.Sequential(
219
+ upsample_block(
220
+ in_channels=self.encoder_channels,
221
+ out_channels=refiner_in_channels[self.model_name],
222
+ norm_layer=norm_layer,
223
+ activation=activation,
224
+ ),
225
+ upsample_block(
226
+ in_channels=refiner_in_channels[self.model_name],
227
+ out_channels=refiner_out_channels[self.model_name],
228
+ norm_layer=norm_layer,
229
+ activation=activation,
230
+ )
231
+ )
232
+ else:
233
+ assert self.block_size == self.encoder_reduction // 2, f"Block size {self.block_size} is not supported for model {self.model_name}. Supported block sizes are {self.encoder_reduction}, {self.encoder_reduction // 2}, and {self.encoder_reduction // 4}."
234
+ self.refiner = upsample_block(
235
+ in_channels=self.encoder_channels,
236
+ out_channels=refiner_out_channels[self.model_name],
237
+ norm_layer=norm_layer,
238
+ activation=activation,
239
+ )
240
+
241
+ self.refiner_channels = refiner_out_channels[self.model_name]
242
+
243
+ else:
244
+ self.refiner = nn.Identity()
245
+ self.refiner_channels = self.encoder_channels
246
+
247
+ self.refiner_reduction = self.block_size
248
+
249
+ if self.refiner_channels <= 256:
250
+ self.decoder = nn.Identity()
251
+ self.decoder_channels = self.refiner_channels
252
+ elif self.refiner_channels <= 512:
253
+ self.decoder = decoder_block(
254
+ in_channels=self.refiner_channels,
255
+ out_channels=self.refiner_channels // 2,
256
+ norm_layer=norm_layer,
257
+ activation=activation,
258
+ )
259
+ self.decoder_channels = self.refiner_channels // 2
260
+ elif self.refiner_channels <= 1024:
261
+ self.decoder = nn.Sequential(
262
+ decoder_block(
263
+ in_channels=self.refiner_channels,
264
+ out_channels=self.refiner_channels // 2,
265
+ norm_layer=norm_layer,
266
+ activation=activation,
267
+ ),
268
+ decoder_block(
269
+ in_channels=self.refiner_channels // 2,
270
+ out_channels=self.refiner_channels // 4,
271
+ norm_layer=norm_layer,
272
+ activation=activation,
273
+ ),
274
+ )
275
+ self.decoder_channels = self.refiner_channels // 4
276
+ else:
277
+ self.decoder = nn.Sequential(
278
+ decoder_block(
279
+ in_channels=self.refiner_channels,
280
+ out_channels=self.refiner_channels // 2,
281
+ norm_layer=norm_layer,
282
+ activation=activation,
283
+ ),
284
+ decoder_block(
285
+ in_channels=self.refiner_channels // 2,
286
+ out_channels=self.refiner_channels // 4,
287
+ norm_layer=norm_layer,
288
+ activation=activation,
289
+ ),
290
+ decoder_block(
291
+ in_channels=self.refiner_channels // 4,
292
+ out_channels=self.refiner_channels // 8,
293
+ norm_layer=norm_layer,
294
+ activation=activation,
295
+ ),
296
+ )
297
+ self.decoder_channels = self.refiner_channels // 8
298
+
299
+ self.decoder_reduction = self.refiner_reduction
300
+
301
+ def encode(self, x: Tensor) -> Tensor:
302
+ return self.encoder(x)[0]
303
+
304
+ def refine(self, x: Tensor) -> Tensor:
305
+ return self.refiner(x)
306
+
307
+ def decode(self, x: Tensor) -> Tensor:
308
+ return self.decoder(x)
309
+
310
+ def forward(self, x: Tensor) -> Tensor:
311
+ x = self.encode(x)
312
+ x = self.refine(x)
313
+ x = self.decode(x)
314
+ return x
315
+
316
+
317
+ def _timm_model(model_name: str, block_size: Optional[int] = None, norm: str = "none", act: str = "none") -> TIMMModel:
318
+ return TIMMModel(model_name, block_size=block_size, norm=norm, act=act)
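For orientation, a minimal usage sketch of the `_timm_model` factory above (not part of the commit; it assumes the pretrained `resnet18` weights can be fetched through timm, and the shapes shown are illustrative):

import torch

model = _timm_model("resnet18", block_size=16, norm="bn", act="relu")
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    y = model(x)
# block_size=16 halves ResNet-18's native stride of 32 via the upsampling refiner,
# so the output grid is 224 / 16 = 14 per side; the decoder then halves the channels.
print(y.shape, model.decoder_channels, model.decoder_reduction)  # (1, 256, 14, 14), 256, 16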
models/ebc/utils.py ADDED
@@ -0,0 +1,37 @@
1
+ from torch import nn
2
+ from typing import List, Union
3
+
4
+
5
+ vgg_urls = {
6
+ "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth",
7
+ "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth",
8
+ "vgg13": "https://download.pytorch.org/models/vgg13-19584684.pth",
9
+ "vgg13_bn": "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth",
10
+ "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth",
11
+ "vgg16_bn": "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth",
12
+ "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth",
13
+ "vgg19_bn": "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth",
14
+ }
15
+
16
+
17
+ vgg_cfgs = {
18
+ "A": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512],
19
+ "B": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512],
20
+ "D": [64, 64, "M", 128, 128, "M", 256, 256, 256, "M", 512, 512, 512, "M", 512, 512, 512],
21
+ "E": [64, 64, "M", 128, 128, "M", 256, 256, 256, 256, "M", 512, 512, 512, 512, "M", 512, 512, 512, 512]
22
+ }
23
+
24
+
25
+ def make_vgg_layers(cfg: List[Union[str, int]], in_channels: int = 3, batch_norm: bool = False, dilation: int = 1) -> nn.Sequential:
26
+ layers = []
27
+ for v in cfg:
28
+ if v == "M":
29
+ layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
30
+ else:
31
+ conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=dilation, dilation=dilation)
32
+ if batch_norm:
33
+ layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
34
+ else:
35
+ layers += [conv2d, nn.ReLU(inplace=True)]
36
+ in_channels = v
37
+ return nn.Sequential(*layers)
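A quick sketch of what `make_vgg_layers` produces for the "D" (VGG-16) configuration above: four max-pool stages, so the trunk has an output stride of 16 and 512 output channels (illustrative only, not part of the commit):

import torch

features = make_vgg_layers(vgg_cfgs["D"], in_channels=3, batch_norm=True)
x = torch.randn(1, 3, 224, 224)
print(features(x).shape)  # torch.Size([1, 512, 14, 14])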
models/ebc/vgg.py ADDED
@@ -0,0 +1,255 @@
1
+ from torch import nn, Tensor
2
+ from torch.hub import load_state_dict_from_url
3
+ from typing import Optional
4
+
5
+ from .utils import make_vgg_layers, vgg_cfgs, vgg_urls
6
+ from ..utils import _init_weights, _get_norm_layer, _get_activation
7
+ from ..utils import ConvDownsample, ConvUpsample
8
+
9
+
10
+ vgg_models = [
11
+ "vgg11", "vgg11_bn",
12
+ "vgg13", "vgg13_bn",
13
+ "vgg16", "vgg16_bn",
14
+ "vgg19", "vgg19_bn",
15
+ ]
16
+
17
+ decoder_cfg = [512, 256, 128]
18
+
19
+
20
+ class VGGEncoder(nn.Module):
21
+ def __init__(
22
+ self,
23
+ model_name: str,
24
+ block_size: Optional[int] = None,
25
+ norm: str = "none",
26
+ act: str = "none",
27
+ ) -> None:
28
+ super().__init__()
29
+ assert model_name in vgg_models, f"Model name should be one of {vgg_models}, but got {model_name}."
30
+ assert block_size is None or block_size in [8, 16, 32], f"Block size should be one of [8, 16, 32], but got {block_size}."
31
+ self.model_name = model_name
32
+
33
+ if model_name == "vgg11":
34
+ self.encoder = vgg11()
35
+ elif model_name == "vgg11_bn":
36
+ self.encoder = vgg11_bn()
37
+ elif model_name == "vgg13":
38
+ self.encoder = vgg13()
39
+ elif model_name == "vgg13_bn":
40
+ self.encoder = vgg13_bn()
41
+ elif model_name == "vgg16":
42
+ self.encoder = vgg16()
43
+ elif model_name == "vgg16_bn":
44
+ self.encoder = vgg16_bn()
45
+ elif model_name == "vgg19":
46
+ self.encoder = vgg19()
47
+ else: # model_name == "vgg19_bn"
48
+ self.encoder = vgg19_bn()
49
+
50
+ self.encoder_channels = 512
51
+ self.encoder_reduction = 16
52
+ self.block_size = block_size if block_size is not None else self.encoder_reduction
53
+
54
+ if norm == "bn":
55
+ norm_layer = nn.BatchNorm2d
56
+ elif norm == "ln":
57
+ norm_layer = nn.LayerNorm
58
+ else:
59
+ norm_layer = _get_norm_layer(self.encoder)
60
+
61
+ if act == "relu":
62
+ activation = nn.ReLU(inplace=True)
63
+ elif act == "gelu":
64
+ activation = nn.GELU()
65
+ else:
66
+ activation = _get_activation(self.encoder)
67
+
68
+ if self.encoder_reduction >= self.block_size: # 8, 16
69
+ self.refiner = ConvUpsample(
70
+ in_channels=self.encoder_channels,
71
+ out_channels=self.encoder_channels,
72
+ scale_factor=self.encoder_reduction // self.block_size,
73
+ norm_layer=norm_layer,
74
+ activation=activation,
75
+ )
76
+ else: # 32
77
+ self.refiner = ConvDownsample(
78
+ in_channels=self.encoder_channels,
79
+ out_channels=self.encoder_channels,
80
+ norm_layer=norm_layer,
81
+ activation=activation,
82
+ )
83
+ self.refiner_channels = self.encoder_channels
84
+ self.refiner_reduction = self.block_size
85
+
86
+ self.decoder = nn.Identity()
87
+ self.decoder_channels = self.encoder_channels
88
+ self.decoder_reduction = self.refiner_reduction
89
+
90
+ def encode(self, x: Tensor) -> Tensor:
91
+ return self.encoder(x)
92
+
93
+ def refine(self, x: Tensor) -> Tensor:
94
+ return self.refiner(x)
95
+
96
+ def decode(self, x: Tensor) -> Tensor:
97
+ return self.decoder(x)
98
+
99
+ def forward(self, x: Tensor) -> Tensor:
100
+ x = self.encode(x)
101
+ x = self.refine(x)
102
+ x = self.decode(x)
103
+ return x
104
+
105
+
106
+ class VGGEncoderDecoder(nn.Module):
107
+ def __init__(
108
+ self,
109
+ model_name: str,
110
+ block_size: Optional[int] = None,
111
+ norm: str = "none",
112
+ act: str = "none",
113
+ ) -> None:
114
+ super().__init__()
115
+ assert model_name in vgg_models, f"Model name should be one of {vgg_models}, but got {model_name}."
116
+ assert block_size is None or block_size in [8, 16, 32], f"Block size should be one of [8, 16, 32], but got {block_size}."
117
+ self.model_name = model_name
118
+
119
+ if model_name == "vgg11":
120
+ encoder = vgg11()
121
+ elif model_name == "vgg11_bn":
122
+ encoder = vgg11_bn()
123
+ elif model_name == "vgg13":
124
+ encoder = vgg13()
125
+ elif model_name == "vgg13_bn":
126
+ encoder = vgg13_bn()
127
+ elif model_name == "vgg16":
128
+ encoder = vgg16()
129
+ elif model_name == "vgg16_bn":
130
+ encoder = vgg16_bn()
131
+ elif model_name == "vgg19":
132
+ encoder = vgg19()
133
+ else: # model_name == "vgg19_bn"
134
+ encoder = vgg19_bn()
135
+
136
+ encoder_channels = 512
137
+ encoder_reduction = 16
138
+ decoder = make_vgg_layers(decoder_cfg, in_channels=encoder_channels, batch_norm="bn" in model_name, dilation=1)
139
+ decoder.apply(_init_weights)
140
+
141
+ if norm == "bn":
142
+ norm_layer = nn.BatchNorm2d
143
+ elif norm == "ln":
144
+ norm_layer = nn.LayerNorm
145
+ else:
146
+ norm_layer = _get_norm_layer(encoder)
147
+
148
+ if act == "relu":
149
+ activation = nn.ReLU(inplace=True)
150
+ elif act == "gelu":
151
+ activation = nn.GELU()
152
+ else:
153
+ activation = _get_activation(encoder)
154
+
155
+ self.encoder = nn.Sequential(encoder, decoder)
156
+ self.encoder_channels = decoder_cfg[-1]
157
+ self.encoder_reduction = encoder_reduction
158
+ self.block_size = block_size if block_size is not None else self.encoder_reduction
159
+
160
+ if self.encoder_reduction >= self.block_size:
161
+ self.refiner = ConvUpsample(
162
+ in_channels=self.encoder_channels,
163
+ out_channels=self.encoder_channels,
164
+ scale_factor=self.encoder_reduction // self.block_size,
165
+ norm_layer=norm_layer,
166
+ activation=activation,
167
+ )
168
+ else:
169
+ self.refiner = ConvDownsample(
170
+ in_channels=self.encoder_channels,
171
+ out_channels=self.encoder_channels,
172
+ norm_layer=norm_layer,
173
+ activation=activation,
174
+ )
175
+ self.refiner_channels = self.encoder_channels
176
+ self.refiner_reduction = self.block_size
177
+
178
+ self.decoder = nn.Identity()
179
+ self.decoder_channels = self.refiner_channels
180
+ self.decoder_reduction = self.refiner_reduction
181
+
182
+ def encode(self, x: Tensor) -> Tensor:
183
+ return self.encoder(x)
184
+
185
+ def refine(self, x: Tensor) -> Tensor:
186
+ return self.refiner(x)
187
+
188
+ def decode(self, x: Tensor) -> Tensor:
189
+ return self.decoder(x)
190
+
191
+ def forward(self, x: Tensor) -> Tensor:
192
+ x = self.encode(x)
193
+ x = self.refine(x)
194
+ x = self.decode(x)
195
+ return x
196
+
197
+
198
+ class VGG(nn.Module):
199
+ def __init__(
200
+ self,
201
+ features: nn.Module,
202
+ ) -> None:
203
+ super().__init__()
204
+ self.features = features
205
+
206
+ def forward(self, x: Tensor) -> Tensor:
207
+ x = self.features(x)
208
+ return x
209
+
210
+
211
+ def vgg11() -> VGG:
212
+ model = VGG(make_vgg_layers(vgg_cfgs["A"]))
213
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg11"]), strict=False)
214
+ return model
215
+
216
+ def vgg11_bn() -> VGG:
217
+ model = VGG(make_vgg_layers(vgg_cfgs["A"], batch_norm=True))
218
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg11_bn"]), strict=False)
219
+ return model
220
+
221
+ def vgg13() -> VGG:
222
+ model = VGG(make_vgg_layers(vgg_cfgs["B"]))
223
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg13"]), strict=False)
224
+ return model
225
+
226
+ def vgg13_bn() -> VGG:
227
+ model = VGG(make_vgg_layers(vgg_cfgs["B"], batch_norm=True))
228
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg13_bn"]), strict=False)
229
+ return model
230
+
231
+ def vgg16() -> VGG:
232
+ model = VGG(make_vgg_layers(vgg_cfgs["D"]))
233
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg16"]), strict=False)
234
+ return model
235
+
236
+ def vgg16_bn() -> VGG:
237
+ model = VGG(make_vgg_layers(vgg_cfgs["D"], batch_norm=True))
238
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg16_bn"]), strict=False)
239
+ return model
240
+
241
+ def vgg19() -> VGG:
242
+ model = VGG(make_vgg_layers(vgg_cfgs["E"]))
243
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg19"]), strict=False)
244
+ return model
245
+
246
+ def vgg19_bn() -> VGG:
247
+ model = VGG(make_vgg_layers(vgg_cfgs["E"], batch_norm=True))
248
+ model.load_state_dict(state_dict=load_state_dict_from_url(vgg_urls["vgg19_bn"]), strict=False)
249
+ return model
250
+
251
+ def _vgg_encoder(model_name: str, block_size: Optional[int] = None, norm: str = "none", act: str = "none") -> VGGEncoder:
252
+ return VGGEncoder(model_name, block_size, norm=norm, act=act)
253
+
254
+ def _vgg_encoder_decoder(model_name: str, block_size: Optional[int] = None, norm: str = "none", act: str = "none") -> VGGEncoderDecoder:
255
+ return VGGEncoderDecoder(model_name, block_size, norm=norm, act=act)
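A hedged usage sketch of the VGG wrappers above (assumes the torchvision VGG weights can be downloaded; shapes are illustrative):

import torch

encoder = _vgg_encoder("vgg19_bn", block_size=8, norm="bn", act="relu")
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    y = encoder(x)
# encoder_reduction is 16 (four poolings); block_size=8 selects a 2x ConvUpsample refiner.
print(y.shape)  # torch.Size([1, 512, 28, 28])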
models/ebc/vit.py ADDED
@@ -0,0 +1,323 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import timm
4
+ from einops import rearrange
5
+ import torch.nn.functional as F
6
+
7
+ import math
8
+ from typing import Optional, Tuple
9
+ from ..utils import ConvUpsample, ConvDownsample, _get_activation, _get_norm_layer, ConvRefine
10
+
11
+
12
+ supported_vit_backbones = [
13
+ # Tiny
14
+ "vit_tiny_patch16_224", "vit_tiny_patch16_384",
15
+ # Small
16
+ "vit_small_patch8_224",
17
+ "vit_small_patch16_224", "vit_small_patch16_384",
18
+ "vit_small_patch32_224", "vit_small_patch32_384",
19
+ # Base
20
+ "vit_base_patch8_224",
21
+ "vit_base_patch16_224", "vit_base_patch16_384",
22
+ "vit_base_patch32_224", "vit_base_patch32_384",
23
+ # Large
24
+ "vit_large_patch16_224", "vit_large_patch16_384",
25
+ "vit_large_patch32_224", "vit_large_patch32_384",
26
+ # Huge
27
+ "vit_huge_patch14_224",
28
+ ]
29
+
30
+
31
+ refiner_channels = {
32
+ "vit_tiny_patch16_224": 192,
33
+ "vit_tiny_patch16_384": 192,
34
+ "vit_small_patch8_224": 384,
35
+ "vit_small_patch16_224": 384,
36
+ "vit_small_patch16_384": 384,
37
+ "vit_small_patch32_224": 384,
38
+ "vit_small_patch32_384": 384,
39
+ "vit_base_patch8_224": 768,
40
+ "vit_base_patch16_224": 768,
41
+ "vit_base_patch16_384": 768,
42
+ "vit_base_patch32_224": 768,
43
+ "vit_base_patch32_384": 768,
44
+ "vit_large_patch16_224": 1024,
45
+ "vit_large_patch16_384": 1024,
46
+ "vit_large_patch32_224": 1024,
47
+ "vit_large_patch32_384": 1024,
48
+ }
49
+
50
+ refiner_groups = {
51
+ "vit_tiny_patch16_224": 1,
52
+ "vit_tiny_patch16_384": 1,
53
+ "vit_small_patch8_224": 1,
54
+ "vit_small_patch16_224": 1,
55
+ "vit_small_patch16_384": 1,
56
+ "vit_small_patch32_224": 1,
57
+ "vit_small_patch32_384": 1,
58
+ "vit_base_patch8_224": 1,
59
+ "vit_base_patch16_224": 1,
60
+ "vit_base_patch16_384": 1,
61
+ "vit_base_patch32_224": 1,
62
+ "vit_base_patch32_384": 1,
63
+ "vit_large_patch16_224": 1,
64
+ "vit_large_patch16_384": 1,
65
+ "vit_large_patch32_224": 1,
66
+ "vit_large_patch32_384": 1,
67
+ }
68
+
69
+
70
+ class ViT(nn.Module):
71
+ def __init__(
72
+ self,
73
+ model_name: str,
74
+ block_size: Optional[int] = None,
75
+ num_vpt: int = 32,
76
+ vpt_drop: float = 0.0,
77
+ input_size: Optional[Tuple[int, int]] = None,
78
+ norm: str = "none",
79
+ act: str = "none"
80
+ ) -> None:
81
+ super().__init__()
82
+ assert model_name in supported_vit_backbones, f"Model {model_name} not supported"
83
+ assert num_vpt >= 0, f"Number of VPT tokens should be non-negative, but got {num_vpt}."
84
+ self.model_name = model_name
85
+
86
+ self.num_vpt = num_vpt
87
+ self.vpt_drop = vpt_drop
88
+
89
+ model = timm.create_model(model_name, pretrained=True)
90
+
91
+ self.input_size = input_size if input_size is not None else model.patch_embed.img_size
92
+ self.pretrain_size = model.patch_embed.img_size
93
+ self.patch_size = model.patch_embed.patch_size
94
+
95
+ if self.patch_size[0] in [8, 16, 32]:
96
+ assert block_size is None or block_size in [8, 16, 32], f"Block size should be one of [8, 16, 32], but got {block_size}."
97
+ else: # patch_size == 14
98
+ assert block_size is None or block_size in [7, 14, 28], f"Block size should be one of [7, 14, 28], but got {block_size}."
99
+
100
+ self.num_layers = len(model.blocks)
101
+ self.embed_dim = model.cls_token.shape[-1]
102
+
103
+ if self.num_vpt > 0: # Use visual prompt tuning so freeze the backbone
104
+ for param in model.parameters():
105
+ param.requires_grad = False
106
+
107
+ # Setup VPT tokens
108
+ val = math.sqrt(6. / float(3 * self.patch_size[0] + self.embed_dim))
109
+ for idx in range(self.num_layers):
110
+ setattr(self, f"vpt_{idx}", nn.Parameter(torch.empty(self.num_vpt, self.embed_dim)))
111
+ nn.init.uniform_(getattr(self, f"vpt_{idx}"), -val, val)
112
+ setattr(self, f"vpt_drop_{idx}", nn.Dropout(self.vpt_drop))
113
+
114
+ self.patch_embed = model.patch_embed
115
+ self.cls_token = model.cls_token
116
+ self.pos_embed = model.pos_embed
117
+ self.pos_drop = model.pos_drop
118
+ self.patch_drop = model.patch_drop
119
+ self.norm_pre = model.norm_pre
120
+
121
+ self.blocks = model.blocks
122
+ self.norm = model.norm
123
+
124
+ self.encoder_channels = self.embed_dim
125
+ self.encoder_reduction = self.patch_size[0]
126
+ self.block_size = block_size if block_size is not None else self.encoder_reduction
127
+
128
+ if norm == "bn":
129
+ norm_layer = nn.BatchNorm2d
130
+ elif norm == "ln":
131
+ norm_layer = nn.LayerNorm
132
+ else:
133
+ norm_layer = _get_norm_layer(model)
134
+
135
+ if act == "relu":
136
+ activation = nn.ReLU(inplace=True)
137
+ elif act == "gelu":
138
+ activation = nn.GELU()
139
+ else:
140
+ activation = _get_activation(model)
141
+
142
+ if self.block_size < self.encoder_reduction:
143
+ assert self.block_size == self.encoder_reduction // 2, f"Block size should be half of the encoder reduction, but got {self.block_size} and {self.encoder_reduction}."
144
+ self.refiner = ConvUpsample(
145
+ in_channels=self.encoder_channels,
146
+ out_channels=self.encoder_channels,
147
+ norm_layer=norm_layer,
148
+ activation=activation,
149
+ )
150
+ elif self.block_size > self.encoder_reduction:
151
+ assert self.block_size == self.encoder_reduction * 2, f"Block size should be double of the encoder reduction, but got {self.block_size} and {self.encoder_reduction}."
152
+ self.refiner = ConvDownsample(
153
+ in_channels=self.encoder_channels,
154
+ out_channels=self.encoder_channels,
155
+ norm_layer=norm_layer,
156
+ activation=activation,
157
+ )
158
+ else:
159
+ self.refiner = ConvRefine(
160
+ in_channels=self.encoder_channels,
161
+ out_channels=self.encoder_channels,
162
+ norm_layer=norm_layer,
163
+ activation=activation,
164
+ )
165
+
166
+ self.refiner_channels = self.encoder_channels
167
+ self.refiner_reduction = self.block_size
168
+
169
+ self.decoder = nn.Identity()
170
+ self.decoder_channels = self.refiner_channels
171
+ self.reduction = self.refiner_reduction
+ self.decoder_reduction = self.refiner_reduction  # also expose the attribute name used by the TIMM/VGG wrappers
172
+
173
+ # Adjust the positional embedding to match the new input size
174
+ self._adjust_pos_embed()
175
+
176
+ def _adjust_pos_embed(self) -> None:
+ """
+ Resize the pretrained positional embedding to match `self.input_size` and
+ re-register it as a parameter (frozen when visual prompt tuning is enabled).
+ """
184
+ self.pos_embed = nn.Parameter(self._interpolate_pos_embed(self.pretrain_size[0], self.pretrain_size[1], self.input_size[0], self.input_size[1]), requires_grad=self.num_vpt == 0)
185
+
186
+ def _interpolate_pos_embed(self, orig_h: int, orig_w: int, new_h: int, new_w: int) -> Tensor:
187
+ """
188
+ Interpolate the positional embedding to match the spatial resolution of the feature map.
189
+
190
+ Args:
191
+ orig_h, orig_w: The original spatial resolution of the image.
192
+ new_h, new_w: The new spatial resolution of the image.
193
+ """
194
+ if (orig_h, orig_w) == (new_h, new_w):
195
+ return self.pos_embed # (1, (h * w + 1), d)
196
+
197
+ orig_h_patches, orig_w_patches = orig_h // self.patch_size[0], orig_w // self.patch_size[1]
198
+ new_h_patches, new_w_patches = new_h // self.patch_size[0], new_w // self.patch_size[1]
199
+ class_pos_embed, patch_pos_embed = self.pos_embed[:, :1, :], self.pos_embed[:, 1:, :]
200
+ patch_pos_embed = rearrange(patch_pos_embed, "1 (h w) d -> 1 d h w", h=orig_h_patches, w=orig_w_patches)
201
+ patch_pos_embed = F.interpolate(patch_pos_embed, size=(new_h_patches, new_w_patches), mode="bicubic", antialias=True)
202
+ patch_pos_embed = rearrange(patch_pos_embed, "1 d h w -> 1 (h w) d")
203
+ pos_embed = torch.cat((class_pos_embed, patch_pos_embed), dim=1)
204
+ return pos_embed
205
+
206
+ def train(self, mode: bool = True):
207
+ super().train(mode)  # keep self.training and all submodules in sync first
+ if self.num_vpt > 0 and mode:
+ # The backbone is frozen under VPT: put it back into eval mode and keep only
+ # the VPT dropouts, refiner and decoder in train mode.
208
+ self.patch_embed.eval()
209
+ self.pos_drop.eval()
210
+ self.patch_drop.eval()
211
+ self.norm_pre.eval()
212
+
213
+ self.blocks.eval()
214
+ self.norm.eval()
215
+
216
+ for idx in range(self.num_layers):
217
+ getattr(self, f"vpt_drop_{idx}").train()
218
+
219
+ self.refiner.train()
220
+ self.decoder.train()
221
+
222
+ else:
223
+ for module in self.children():
224
+ module.train(mode)
+ return self
225
+
226
+ def _prepare_vpt(self, layer: int, batch_size: int, device: torch.device) -> Tensor:
227
+ vpt = getattr(self, f"vpt_{layer}").unsqueeze(0).expand(batch_size, -1, -1).to(device) # (batch_size, num_vpt, embed_dim)
228
+ vpt = getattr(self, f"vpt_drop_{layer}")(vpt)
229
+
230
+ return vpt
231
+
232
+ def _forward_patch_embed(self, x: Tensor) -> Tensor:
233
+ # This step performs 1) embed x into patches; 2) append the class token; 3) add positional embeddings.
234
+ assert len(x.shape) == 4, f"Expected input to have shape (batch_size, 3, height, width), but got {x.shape}"
235
+ batch_size, _, height, width = x.shape
236
+
237
+ # Step 1: Embed x into patches
238
+ x = self.patch_embed(x) # (b, h * w, d)
239
+
240
+ # Step 2: Append the class token
241
+ cls_token = self.cls_token.expand(batch_size, 1, -1)
242
+ x = torch.cat([cls_token, x], dim=1)
243
+
244
+ # Step 3: Add positional embeddings
245
+ pos_embed = self._interpolate_pos_embed(orig_h=self.input_size[0], orig_w=self.input_size[1], new_h=height, new_w=width).expand(batch_size, -1, -1)
246
+ x = self.pos_drop(x + pos_embed)
247
+ return x
248
+
249
+ def _forward_vpt(self, x: Tensor, idx: int) -> Tensor:
250
+ batch_size = x.shape[0]
251
+ device = x.device
252
+
253
+ # Assemble
254
+ vpt = self._prepare_vpt(idx, batch_size, device)
255
+ x = torch.cat([
256
+ x[:, :1, :], # class token
257
+ vpt,
258
+ x[:, 1:, :] # patches
259
+ ], dim=1)
260
+
261
+ # Forward
262
+ x = self.blocks[idx](x)
263
+
264
+ # Disassemble
265
+ x = torch.cat([
266
+ x[:, :1, :], # class token
267
+ x[:, 1 + self.num_vpt:, :] # patches
268
+ ], dim=1)
269
+
270
+ return x
271
+
272
+ def _forward(self, x: Tensor, idx: int) -> Tensor:
273
+ x = self.blocks[idx](x)
274
+ return x
275
+
276
+ def encode(self, x: Tensor) -> Tensor:
277
+ orig_h, orig_w = x.shape[-2:]
278
+ num_patches_h, num_patches_w = orig_h // self.patch_size[0], orig_w // self.patch_size[1]
279
+
280
+ x = self._forward_patch_embed(x)
281
+ x = self.patch_drop(x)
282
+ x = self.norm_pre(x)
283
+
284
+ for idx in range(self.num_layers):
285
+ x = self._forward_vpt(x, idx) if self.num_vpt > 0 else self._forward(x, idx)
286
+
287
+ x = self.norm(x)
288
+ x = x[:, 1:, :]
289
+ x = rearrange(x, "b (h w) d -> b d h w", h=num_patches_h, w=num_patches_w)
290
+ return x
291
+
292
+ def refine(self, x: Tensor) -> Tensor:
293
+ return self.refiner(x)
294
+
295
+ def decode(self, x: Tensor) -> Tensor:
296
+ return self.decoder(x)
297
+
298
+ def forward(self, x: Tensor) -> Tensor:
299
+ x = self.encode(x)
300
+ x = self.refine(x)
301
+ x = self.decode(x)
302
+ return x
303
+
304
+
305
+ def _vit(
306
+ model_name: str,
307
+ block_size: Optional[int] = None,
308
+ num_vpt: int = 32,
309
+ vpt_drop: float = 0.0,
310
+ input_size: Optional[Tuple[int, int]] = None,
311
+ norm: str = "none",
312
+ act: str = "none"
313
+ ) -> ViT:
314
+ model = ViT(
315
+ model_name=model_name,
316
+ block_size=block_size,
317
+ num_vpt=num_vpt,
318
+ vpt_drop=vpt_drop,
319
+ input_size=input_size,
320
+ norm=norm,
321
+ act=act
322
+ )
323
+ return model
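A minimal sketch of the `_vit` factory above under visual prompt tuning (assumes the timm ViT-B/16-384 weights are available; names and shapes are illustrative):

import torch

vit = _vit("vit_base_patch16_384", block_size=16, num_vpt=32, vpt_drop=0.1, input_size=(384, 384))
# With num_vpt > 0 the backbone and the resized pos_embed are frozen; only the
# vpt_* tokens and the refiner remain trainable.
trainable = {n for n, p in vit.named_parameters() if p.requires_grad}
x = torch.randn(1, 3, 384, 384)
with torch.no_grad():
    y = vit(x)
print(y.shape)  # torch.Size([1, 768, 24, 24]) -- 384 / block_size 16 per side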
models/utils/__init__.py ADDED
@@ -0,0 +1,56 @@
1
+ from torch import nn
2
+ from typing import Optional
3
+ from functools import partial
4
+
5
+ from .utils import _init_weights, interpolate_pos_embed
6
+ from .blocks import DepthSeparableConv2d, conv1x1, conv3x3, Conv2dLayerNorm
7
+ from .refine import ConvRefine, LightConvRefine, LighterConvRefine
8
+ from .downsample import ConvDownsample, LightConvDownsample, LighterConvDownsample
9
+ from .upsample import ConvUpsample, LightConvUpsample, LighterConvUpsample
10
+ from .multi_scale import MultiScale
11
+ from .blocks import ConvAdapter, ViTAdapter
12
+
13
+
14
+ def _get_norm_layer(model: nn.Module) -> Optional[nn.Module]:
15
+ for module in model.modules():
16
+ if isinstance(module, nn.BatchNorm2d):
17
+ return nn.BatchNorm2d
18
+ elif isinstance(module, nn.GroupNorm):
19
+ num_groups = module.num_groups
20
+ return partial(nn.GroupNorm, num_groups=num_groups)
21
+ elif isinstance(module, (nn.LayerNorm, Conv2dLayerNorm)):
22
+ return Conv2dLayerNorm
23
+ return None
24
+
25
+
26
+ def _get_activation(model: nn.Module) -> Optional[nn.Module]:
27
+ for module in model.modules():
28
+ if isinstance(module, nn.BatchNorm2d):
29
+ return nn.ReLU(inplace=True)
30
+ elif isinstance(module, nn.GroupNorm):
31
+ return nn.ReLU(inplace=True)
32
+ elif isinstance(module, (nn.LayerNorm, Conv2dLayerNorm)):
33
+ return nn.GELU()
34
+ return nn.GELU()
35
+
36
+
37
+
38
+ __all__ = [
39
+ "_init_weights", "_check_norm_layer", "_check_activation",
40
+ "conv1x1",
41
+ "conv3x3",
42
+ "Conv2dLayerNorm",
43
+ "interpolate_pos_embed",
44
+ "DepthSeparableConv2d",
45
+ "ConvRefine",
46
+ "LightConvRefine",
47
+ "LighterConvRefine",
48
+ "ConvDownsample",
49
+ "LightConvDownsample",
50
+ "LighterConvDownsample",
51
+ "ConvUpsample",
52
+ "LightConvUpsample",
53
+ "LighterConvUpsample",
54
+ "MultiScale",
55
+ "ConvAdapter", "ViTAdapter",
56
+ ]
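The two helpers above pick a norm/activation family that matches whatever the wrapped encoder already uses; a small self-contained sketch (illustrative only):

from torch import nn

cnn = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
print(_get_norm_layer(cnn))    # <class 'torch.nn.modules.batchnorm.BatchNorm2d'>
print(_get_activation(cnn))    # ReLU(inplace=True)

mlp = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))
print(_get_norm_layer(mlp))    # Conv2dLayerNorm (channels-last LayerNorm applied to 2d maps)
print(_get_activation(mlp))    # GELU()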
models/utils/blocks.py ADDED
@@ -0,0 +1,617 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import torch.nn.functional as F
4
+ from einops import rearrange
5
+ from einops.layers.torch import Rearrange
6
+
7
+ from typing import Callable, Optional, Sequence, Tuple, Union, List
8
+ import warnings
9
+
10
+ from .utils import _init_weights, _make_ntuple, _log_api_usage_once
11
+
12
+
13
+ def conv3x3(
14
+ in_channels: int,
15
+ out_channels: int,
16
+ stride: int = 1,
17
+ groups: int = 1,
18
+ dilation: int = 1,
19
+ bias: bool = True,
20
+ ) -> nn.Conv2d:
21
+ """3x3 convolution with padding"""
22
+ conv = nn.Conv2d(
23
+ in_channels,
24
+ out_channels,
25
+ kernel_size=3,
26
+ stride=stride,
27
+ padding=dilation,
28
+ groups=groups,
29
+ bias=bias,
30
+ dilation=dilation,
31
+ )
32
+ conv.apply(_init_weights)
33
+ return conv
34
+
35
+
36
+ def conv1x1(
37
+ in_channels: int,
38
+ out_channels: int,
39
+ stride: int = 1,
40
+ bias: bool = True,
41
+ ) -> nn.Conv2d:
42
+ """1x1 convolution"""
43
+ conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=bias)
44
+ conv.apply(_init_weights)
45
+ return conv
46
+
47
+
48
+ class DepthSeparableConv2d(nn.Module):
49
+ def __init__(
50
+ self,
51
+ in_channels: int,
52
+ out_channels: int,
53
+ kernel_size: int,
54
+ stride: int = 1,
55
+ padding: int = 0,
56
+ dilation: int = 1,
57
+ bias: bool = True,
58
+ padding_mode: str = "zeros",
59
+ ) -> None:
60
+ super().__init__()
61
+ # Depthwise convolution: one filter per input channel.
62
+ self.depthwise = nn.Conv2d(
63
+ in_channels=in_channels,
64
+ out_channels=in_channels,
65
+ kernel_size=kernel_size,
66
+ stride=stride,
67
+ padding=padding,
68
+ dilation=dilation,
69
+ groups=in_channels,
70
+ bias=bias,
71
+ padding_mode=padding_mode
72
+ )
73
+ # Pointwise convolution: combine the features across channels.
74
+ self.pointwise = nn.Conv2d(
75
+ in_channels=in_channels,
76
+ out_channels=out_channels,
77
+ kernel_size=1,
78
+ stride=1,
79
+ padding=0,
80
+ dilation=1,
81
+ groups=1,
82
+ bias=bias,
83
+ padding_mode=padding_mode
84
+ )
85
+ self.apply(_init_weights)
86
+
87
+ def forward(self, x: Tensor) -> Tensor:
88
+ return self.pointwise(self.depthwise(x))
89
+
90
+
91
+ class SEBlock(nn.Module):
92
+ def __init__(self, channels: int, reduction: int = 16):
93
+ super().__init__()
94
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
95
+ self.fc = nn.Sequential(
96
+ nn.Linear(channels, channels // reduction, bias=False),
97
+ nn.ReLU(inplace=True),
98
+ nn.Linear(channels // reduction, channels, bias=False),
99
+ nn.Sigmoid()
100
+ )
101
+ self.apply(_init_weights)
102
+
103
+ def forward(self, x: Tensor) -> Tensor:
104
+ B, C, _, _ = x.shape
105
+ y = self.avg_pool(x).view(B, C)
106
+ y = self.fc(y).view(B, C, 1, 1)
107
+ return x * y
108
+
109
+
110
+ class BasicBlock(nn.Module):
111
+ def __init__(
112
+ self,
113
+ in_channels: int,
114
+ out_channels: int,
115
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
116
+ activation: nn.Module = nn.ReLU(inplace=True),
117
+ groups: int = 1,
118
+ ) -> None:
119
+ super().__init__()
120
+ assert isinstance(groups, int) and groups > 0, f"Expected groups to be a positive integer, but got {groups}"
121
+ assert in_channels % groups == 0, f"Expected in_channels to be divisible by groups, but got {in_channels} % {groups}"
122
+ assert out_channels % groups == 0, f"Expected out_channels to be divisible by groups, but got {out_channels} % {groups}"
123
+ self.grouped_conv = groups > 1
124
+ self.conv1 = conv3x3(
125
+ in_channels=in_channels,
126
+ out_channels=out_channels,
127
+ stride=1,
128
+ bias=not norm_layer,
129
+ groups=groups,
130
+ )
131
+ if self.grouped_conv:
132
+ self.conv1_1x1 = conv1x1(out_channels, out_channels, stride=1, bias=not norm_layer)
133
+
134
+ self.norm1 = norm_layer(out_channels) if norm_layer else nn.Identity()
135
+ self.act1 = activation
136
+
137
+ self.conv2 = conv3x3(
138
+ in_channels=out_channels,
139
+ out_channels=out_channels,
140
+ stride=1,
141
+ bias=not norm_layer,
142
+ groups=groups,
143
+ )
144
+ if self.grouped_conv:
145
+ self.conv2_1x1 = conv1x1(out_channels, out_channels, stride=1, bias=not norm_layer)
146
+
147
+ self.norm2 = norm_layer(out_channels) if norm_layer else nn.Identity()
148
+ self.act2 = activation
149
+
150
+ if in_channels != out_channels:
151
+ self.downsample = nn.Sequential(
152
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
153
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
154
+ )
155
+ else:
156
+ self.downsample = nn.Identity()
157
+
158
+ self.apply(_init_weights)
159
+
160
+ def forward(self, x: Tensor) -> Tensor:
161
+ identity = x
162
+
163
+ out = self.conv1(x)
164
+ out = self.conv1_1x1(out) if self.grouped_conv else out
165
+ out = self.norm1(out)
166
+ out = self.act1(out)
167
+
168
+ out = self.conv2(out)
169
+ out = self.conv2_1x1(out) if self.grouped_conv else out
170
+ out = self.norm2(out)
171
+
172
+ out += self.downsample(identity)
173
+ out = self.act2(out)
174
+
175
+ return out
176
+
177
+
178
+ class LightBasicBlock(nn.Module):
179
+ def __init__(
180
+ self,
181
+ in_channels: int,
182
+ out_channels: int,
183
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
184
+ activation: nn.Module = nn.ReLU(inplace=True),
185
+ ) -> None:
186
+ super().__init__()
187
+ self.conv1 = DepthSeparableConv2d(
188
+ in_channels=in_channels,
189
+ out_channels=out_channels,
190
+ kernel_size=3,
191
+ stride=1,
192
+ padding=1,
193
+ bias=not norm_layer,
194
+ )
195
+ self.norm1 = norm_layer(out_channels) if norm_layer else nn.Identity()
196
+ self.act1 = activation
197
+
198
+ self.conv2 = DepthSeparableConv2d(
199
+ in_channels=out_channels,
200
+ out_channels=out_channels,
201
+ kernel_size=3,
202
+ stride=1,
203
+ padding=1,
204
+ bias=not norm_layer,
205
+ )
206
+ self.norm2 = norm_layer(out_channels) if norm_layer else nn.Identity()
207
+ self.act2 = activation
208
+
209
+ if in_channels != out_channels:
210
+ self.downsample = nn.Sequential(
211
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
212
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
213
+ )
214
+ else:
215
+ self.downsample = nn.Identity()
216
+
217
+ self.apply(_init_weights)
218
+
219
+ def forward(self, x: Tensor) -> Tensor:
220
+ identity = x
221
+
222
+ out = self.conv1(x)
223
+ out = self.norm1(out)
224
+ out = self.act1(out)
225
+
226
+ out = self.conv2(out)
227
+ out = self.norm2(out)
228
+
229
+ out += self.downsample(identity)
230
+ out = self.act2(out)
231
+
232
+ return out
233
+
234
+
235
+ class Bottleneck(nn.Module):
236
+ def __init__(
237
+ self,
238
+ in_channels: int,
239
+ out_channels: int,
240
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
241
+ activation: nn.Module = nn.ReLU(inplace=True),
242
+ groups: int = 1,
243
+ base_width: int = 64,
244
+ expansion: float = 2.0,
245
+ ) -> None:
246
+ super().__init__()
247
+ assert isinstance(groups, int) and groups > 0, f"Expected groups to be a positive integer, but got {groups}"
248
+ assert expansion > 0, f"Expected expansion to be greater than 0, but got {expansion}"
249
+ assert base_width > 0, f"Expected base_width to be greater than 0, but got {base_width}"
250
+ bottleneck_channels = int(in_channels * (base_width / 64.0) * expansion)
251
+ assert bottleneck_channels % groups == 0, f"Expected bottleneck_channels to be divisible by groups, but got {bottleneck_channels} % {groups}"
252
+ self.grouped_conv = groups > 1
253
+ self.expansion, self.base_width = expansion, base_width
254
+
255
+ self.conv_in = conv1x1(in_channels, bottleneck_channels, stride=1, bias=not norm_layer)
256
+ self.norm_in = norm_layer(bottleneck_channels) if norm_layer else nn.Identity()
257
+ self.act_in = activation
258
+
259
+ self.se_in = SEBlock(bottleneck_channels) if bottleneck_channels > in_channels else nn.Identity()
260
+
261
+ self.conv_block_1 = nn.Sequential(
262
+ conv3x3(
263
+ in_channels=bottleneck_channels,
264
+ out_channels=bottleneck_channels,
265
+ stride=1,
266
+ groups=groups,
267
+ bias=not norm_layer
268
+ ),
269
+ conv1x1(bottleneck_channels, bottleneck_channels, stride=1, bias=not norm_layer) if groups > 1 else nn.Identity(),
270
+ norm_layer(bottleneck_channels) if norm_layer else nn.Identity(),
271
+ activation,
272
+ )
273
+
274
+ self.conv_block_2 = nn.Sequential(
275
+ conv3x3(
276
+ in_channels=bottleneck_channels,
277
+ out_channels=bottleneck_channels,
278
+ stride=1,
279
+ groups=groups,
280
+ bias=not norm_layer
281
+ ),
282
+ conv1x1(bottleneck_channels, bottleneck_channels, stride=1, bias=not norm_layer) if groups > 1 else nn.Identity(),
283
+ norm_layer(bottleneck_channels) if norm_layer else nn.Identity(),
284
+ activation,
285
+ )
286
+
287
+ self.conv_out = conv1x1(bottleneck_channels, out_channels, stride=1, bias=not norm_layer)
288
+ self.norm_out = norm_layer(out_channels) if norm_layer else nn.Identity()
289
+ self.act_out = activation
290
+ self.se_out = SEBlock(out_channels) if out_channels > bottleneck_channels else nn.Identity()
291
+
292
+ if in_channels != out_channels:
293
+ self.downsample = nn.Sequential(
294
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
295
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
296
+ )
297
+ else:
298
+ self.downsample = nn.Identity()
299
+
300
+ self.apply(_init_weights)
301
+
302
+ def forward(self, x: Tensor) -> Tensor:
303
+ identity = x
304
+
305
+ # expand
306
+ out = self.conv_in(x)
307
+ out = self.norm_in(out)
308
+ out = self.act_in(out)
309
+ out = self.se_in(out)
310
+
311
+ # conv
312
+ out = self.conv_block_1(out)
313
+ out = self.conv_block_2(out)
314
+
315
+ # reduce
316
+ out = self.conv_out(out)
317
+ out = self.norm_out(out)
318
+ out = self.se_out(out)
319
+
320
+ out += self.downsample(identity)
321
+ out = self.act_out(out)
322
+ return out
323
+
324
+
325
+ class ConvASPP(nn.Module):
326
+ def __init__(
327
+ self,
328
+ in_channels: int,
329
+ out_channels: int,
330
+ dilations: List[int] = [1, 2, 4],
331
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
332
+ activation: nn.Module = nn.ReLU(inplace=True),
333
+ groups: int = 1,
334
+ base_width: int = 64,
335
+ expansion: float = 2.0,
336
+ ) -> None:
337
+ super().__init__()
338
+ assert isinstance(groups, int) and groups > 0, f"Expected groups to be a positive integer, but got {groups}"
339
+ assert expansion > 0, f"Expected expansion to be greater than 0, but got {expansion}"
340
+ assert base_width > 0, f"Expected base_width to be greater than 0, but got {base_width}"
341
+ bottleneck_channels = int(in_channels * (base_width / 64.0) * expansion)
342
+ assert bottleneck_channels % groups == 0, f"Expected bottleneck_channels to be divisible by groups, but got {bottleneck_channels} % {groups}"
343
+ self.expansion, self.base_width = expansion, base_width
344
+
345
+ self.conv_in = conv1x1(in_channels, bottleneck_channels, stride=1, bias=not norm_layer)
346
+ self.norm_in = norm_layer(bottleneck_channels) if norm_layer else nn.Identity()
347
+ self.act_in = activation
348
+
349
+ conv_blocks = [nn.Sequential(
350
+ conv1x1(bottleneck_channels, bottleneck_channels, stride=1, bias=not norm_layer),
351
+ norm_layer(bottleneck_channels) if norm_layer else nn.Identity(),
352
+ activation
353
+ )]
354
+
355
+ for dilation in dilations:
356
+ conv_blocks.append(nn.Sequential(
357
+ conv3x3(
358
+ in_channels=bottleneck_channels,
359
+ out_channels=bottleneck_channels,
360
+ stride=1,
361
+ groups=groups,
362
+ dilation=dilation,
363
+ bias=not norm_layer
364
+ ),
365
+ conv1x1(bottleneck_channels, bottleneck_channels, stride=1, bias=not norm_layer) if groups > 1 else nn.Identity(),
366
+ norm_layer(bottleneck_channels) if norm_layer else nn.Identity(),
367
+ activation
368
+ ))
369
+
370
+ self.convs = nn.ModuleList(conv_blocks)
371
+
372
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
373
+ self.conv_avg = conv1x1(bottleneck_channels, bottleneck_channels, stride=1, bias=not norm_layer)
374
+ self.norm_avg = norm_layer(bottleneck_channels) if norm_layer else nn.Identity()
375
+ self.act_avg = activation
376
+
377
+ self.se = SEBlock(bottleneck_channels * (len(dilations) + 2))
378
+
379
+ self.conv_out = conv1x1(bottleneck_channels * (len(dilations) + 2), out_channels, stride=1, bias=not norm_layer)
380
+ self.norm_out = norm_layer(out_channels) if norm_layer else nn.Identity()
381
+ self.act_out = activation
382
+
383
+ if in_channels != out_channels:
384
+ self.downsample = nn.Sequential(
385
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
386
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
387
+ )
388
+ else:
389
+ self.downsample = nn.Identity()
390
+
391
+ self.apply(_init_weights)
392
+
393
+ def forward(self, x: Tensor) -> Tensor:
394
+ height, width = x.shape[-2:]
395
+ identity = x
396
+
397
+ # expand
398
+ out = self.conv_in(x)
399
+ out = self.norm_in(out)
400
+ out = self.act_in(out)
401
+
402
+ outs = []
403
+ for conv in self.convs:
404
+ outs.append(conv(out))
405
+
406
+ avg = self.avgpool(out)
407
+ avg = self.conv_avg(avg)
408
+ avg = self.norm_avg(avg)
409
+ avg = self.act_avg(avg) # (B, C, 1, 1)
410
+ avg = avg.repeat(1, 1, height, width)
411
+
412
+ outs = torch.cat([*outs, avg], dim=1) # (B, C * (len(dilations) + 2), H, W)
413
+ outs = self.se(outs)
414
+
415
+ # reduce
416
+ outs = self.conv_out(outs)
417
+ outs = self.norm_out(outs)
418
+
419
+ outs += self.downsample(identity)
420
+ outs = self.act_out(outs)
421
+ return outs
422
+
423
+
424
+ class ViTBlock(nn.Module):
425
+ def __init__(
426
+ self,
427
+ embed_dim: int,
428
+ num_heads: int = 8,
429
+ dropout: float = 0.0,
430
+ mlp_ratio: float = 4.0,
431
+ ) -> None:
432
+ super().__init__()
433
+ assert embed_dim % num_heads == 0, f"Embedding dimension {embed_dim} should be divisible by number of heads {num_heads}"
434
+ self.embed_dim, self.num_heads = embed_dim, num_heads
435
+ self.dropout, self.mlp_ratio = dropout, mlp_ratio
436
+
437
+ self.norm1 = nn.LayerNorm(embed_dim)
438
+ self.attn = nn.MultiheadAttention(
439
+ embed_dim=embed_dim,
440
+ num_heads=num_heads,
441
+ dropout=dropout,
442
+ batch_first=True
443
+ )
444
+
445
+ self.norm2 = nn.LayerNorm(embed_dim)
446
+ self.mlp = nn.Sequential(
447
+ nn.Linear(embed_dim, int(embed_dim * mlp_ratio)),
448
+ nn.GELU(),
449
+ nn.Dropout(dropout) if dropout > 0 else nn.Identity(),
450
+ nn.Linear(int(embed_dim * mlp_ratio), embed_dim),
451
+ nn.Dropout(dropout) if dropout > 0 else nn.Identity()
452
+ )
453
+ self.apply(_init_weights)
454
+
455
+ def forward(self, x: Tensor) -> Tensor:
456
+ assert len(x.shape) == 3, f"Expected input to have shape (B, N, C), but got {x.shape}"
457
+ y = self.norm1(x)
+ attn_out, _ = self.attn(y, y, y, need_weights=False)  # nn.MultiheadAttention takes (query, key, value) and returns (output, weights)
+ x = x + attn_out
458
+ x = x + self.mlp(self.norm2(x))
459
+ return x
460
+
461
+
462
+ class Conv2dLayerNorm(nn.Sequential):
463
+ """
464
+ Layer normalization applied in a convolutional fashion.
465
+ """
466
+ def __init__(self, dim: int) -> None:
467
+ super().__init__(
468
+ Rearrange("B C H W -> B H W C"),
469
+ nn.LayerNorm(dim),
470
+ Rearrange("B H W C -> B C H W")
471
+ )
472
+ self.apply(_init_weights)
473
+
474
+
475
+ class CvTAttention(nn.Module):
476
+ def __init__(
477
+ self,
478
+ embed_dim: int,
479
+ num_heads: int = 8,
480
+ dropout: float = 0.0,
481
+ q_stride: int = 1, # controls downsampling rate
482
+ kv_stride: int = 1,
483
+ ) -> None:
484
+ super().__init__()
485
+ assert embed_dim % num_heads == 0, f"Embedding dimension {embed_dim} should be divisible by number of heads {num_heads}"
486
+ self.embed_dim, self.num_heads, self.dim_head = embed_dim, num_heads, embed_dim // num_heads
487
+ self.scale = self.dim_head ** -0.5
488
+ self.q_stride, self.kv_stride = q_stride, kv_stride
489
+
490
+ self.attend = nn.Softmax(dim=-1)
491
+ self.dropout = nn.Dropout(dropout)
492
+
493
+ self.to_q = DepthSeparableConv2d(
494
+ in_channels=embed_dim,
495
+ out_channels=embed_dim,
496
+ kernel_size=3,
497
+ stride=q_stride,
498
+ padding=1,
499
+ bias=False
500
+ )
501
+ self.to_k = DepthSeparableConv2d(
502
+ in_channels=embed_dim,
503
+ out_channels=embed_dim,
504
+ kernel_size=3,
505
+ stride=kv_stride,
506
+ padding=1,
507
+ bias=False
508
+ )
509
+ self.to_v = DepthSeparableConv2d(
510
+ in_channels=embed_dim,
511
+ out_channels=embed_dim,
512
+ kernel_size=3,
513
+ stride=kv_stride,
514
+ padding=1,
515
+ bias=False
516
+ )
517
+
518
+ self.to_out = nn.Sequential(
519
+ conv1x1(embed_dim, embed_dim, stride=1),
520
+ nn.Dropout(dropout) if dropout > 0 else nn.Identity()
521
+ )
522
+
523
+ self.apply(_init_weights)
524
+
525
+ def forward(self, x: Tensor) -> Tensor:
526
+ assert len(x.shape) == 4, f"Expected input to have shape (B, C, H, W), but got {x.shape}"
527
+ assert x.shape[1] == self.embed_dim, f"Expected input to have embedding dimension {self.embed_dim}, but got {x.shape[1]}"
528
+
529
+ q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
530
+ B, _, H, W = q.shape
531
+ q, k, v = map(lambda t: rearrange(t, "B (num_heads head_dim) H W -> (B num_heads) (H W) head_dim", num_heads=self.num_heads), (q, k, v))
532
+ attn = (q @ k.transpose(-2, -1)) * self.scale
533
+ attn = self.attend(attn)
534
+ attn = self.dropout(attn)
535
+
536
+ out = attn @ v
537
+ out = rearrange(out, "(B num_heads) (H W) head_dim -> B (num_heads head_dim) H W", B=B, H=H, W=W, num_heads=self.num_heads)
538
+ out = self.to_out(out)
539
+
540
+ return out
541
+
542
+
543
+ class CvTBlock(nn.Module):
544
+ """
545
+ Implement convolutional vision transformer block.
546
+ """
547
+ def __init__(
548
+ self,
549
+ embed_dim: int,
550
+ num_heads: int = 8,
551
+ dropout: float = 0.0,
552
+ mlp_ratio: float = 4.0,
553
+ q_stride: int = 1,
554
+ kv_stride: int = 1,
555
+ ) -> None:
556
+ super().__init__()
557
+ assert embed_dim % num_heads == 0, f"Embedding dimension {embed_dim} should be divisible by number of heads {num_heads}."
558
+ self.embed_dim, self.num_heads = embed_dim, num_heads
559
+
560
+ self.norm1 = Conv2dLayerNorm(embed_dim)
561
+ self.attn = CvTAttention(embed_dim, num_heads, dropout, q_stride, kv_stride)
562
+
563
+ self.pool = nn.AvgPool2d(kernel_size=q_stride, stride=q_stride) if q_stride > 1 else nn.Identity()
564
+
565
+ self.norm2 = Conv2dLayerNorm(embed_dim)
566
+ self.mlp = nn.Sequential(
567
+ nn.Conv2d(embed_dim, int(embed_dim * mlp_ratio), kernel_size=1),
568
+ nn.GELU(),
569
+ nn.Dropout(dropout) if dropout > 0 else nn.Identity(),
570
+ nn.Conv2d(int(embed_dim * mlp_ratio), embed_dim, kernel_size=1),
571
+ nn.Dropout(dropout) if dropout > 0 else nn.Identity()
572
+ )
573
+
574
+ def forward(self, x: Tensor) -> Tensor:
575
+ x = self.pool(x) + self.attn(self.norm1(x))
576
+ x = x + self.mlp(self.norm2(x))
577
+ return x
578
+
579
+
580
+ class ConvAdapter(nn.Module):
581
+ def __init__(
582
+ self,
583
+ in_channels: int,
584
+ bottleneck_channels: int = 16,
585
+ ) -> None:
586
+ super().__init__()
587
+ assert in_channels > 0, f"Expected input_channels to be greater than 0, but got {in_channels}"
588
+ assert bottleneck_channels > 0, f"Expected bottleneck_channels to be greater than 0, but got {bottleneck_channels}"
589
+
590
+ self.adapter = nn.Sequential(
591
+ nn.Conv2d(in_channels, bottleneck_channels, kernel_size=1),
592
+ nn.GELU(),
593
+ nn.Conv2d(bottleneck_channels, in_channels, kernel_size=1),
594
+ )
595
+ nn.init.zeros_(self.adapter[2].weight)
596
+ nn.init.zeros_(self.adapter[2].bias)
597
+
598
+ def forward(self, x: Tensor) -> Tensor:
599
+ assert len(x.shape) == 4, f"Expected input to have shape (B, C, H, W), but got {x.shape}"
600
+ return x + self.adapter(x)
601
+
602
+
603
+ class ViTAdapter(nn.Module):
604
+ def __init__(self, input_dim: int, bottleneck_dim: int) -> None:
605
+ super().__init__()
606
+ self.adapter = nn.Sequential(
607
+ nn.Linear(input_dim, bottleneck_dim),
608
+ nn.GELU(),  # GELU is the activation typically used in ViT
609
+ nn.Linear(bottleneck_dim, input_dim)
610
+ )
611
+ nn.init.zeros_(self.adapter[2].weight)
612
+ nn.init.zeros_(self.adapter[2].bias)
613
+
614
+ def forward(self, x: Tensor) -> Tensor:
615
+ assert len(x.shape) == 3, f"Expected input to have shape (B, N, C), but got {x.shape}"
616
+ return x + self.adapter(x)
617
+
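Because the last projection of both adapters above is zero-initialised, they start out as exact identity mappings; a short sketch (illustrative, not part of the commit):

import torch

vit_adapter = ViTAdapter(input_dim=768, bottleneck_dim=64)
tokens = torch.randn(2, 197, 768)
assert torch.allclose(vit_adapter(tokens), tokens)   # residual branch contributes zero at init

conv_adapter = ConvAdapter(in_channels=256, bottleneck_channels=16)
fmap = torch.randn(2, 256, 14, 14)
assert torch.allclose(conv_adapter(fmap), fmap)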
models/utils/carafe.py ADDED
@@ -0,0 +1,203 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ def carafe_forward(
6
+ features: torch.Tensor,
7
+ masks: torch.Tensor,
8
+ kernel_size: int,
9
+ group_size: int,
10
+ scale_factor: int
11
+ ) -> torch.Tensor:
12
+ """
13
+ Pure-PyTorch implementation of the CARAFE upsampling operator.
14
+
15
+ Args:
16
+ features (Tensor): Input feature map of shape (N, C, H, W).
17
+ masks (Tensor): Reassembly kernel weights of shape
18
+ (N, kernel_size*kernel_size*group_size, H_out, W_out),
19
+ where H_out = H*scale_factor and W_out = W*scale_factor.
20
+ kernel_size (int): The spatial size of the reassembly kernel.
21
+ group_size (int): The group size to divide channels. Must divide C.
22
+ scale_factor (int): The upsampling factor.
23
+
24
+ Returns:
25
+ Tensor: Upsampled feature map of shape (N, C, H*scale_factor, W*scale_factor).
26
+ """
27
+ N, C, H, W = features.size()
28
+ out_H, out_W = H * scale_factor, W * scale_factor
29
+ num_channels = C // group_size # channels per group
30
+
31
+ # Reshape features to (N, group_size, num_channels, H, W)
32
+ features = features.view(N, group_size, num_channels, H, W)
33
+ # Merge batch and group dims for unfolding
34
+ features_reshaped = features.view(N * group_size, num_channels, H, W)
35
+ # Extract local patches; use padding so that output spatial dims match input
36
+ patches = F.unfold(features_reshaped, kernel_size=kernel_size,
37
+ padding=(kernel_size - 1) // 2)
38
+ # patches shape: (N*group_size, num_channels*kernel_size*kernel_size, H*W)
39
+ # Reshape to (N, group_size, num_channels, kernel_size*kernel_size, H, W)
40
+ patches = patches.view(N, group_size, num_channels, kernel_size * kernel_size, H, W)
41
+ # Flatten spatial dimensions: now (N, group_size, num_channels, kernel_size*kernel_size, H*W)
42
+ patches = patches.view(N, group_size, num_channels, kernel_size * kernel_size, H * W)
43
+
44
+ # For each output pixel location, determine the corresponding base input index.
45
+ # For an output coordinate (oh, ow), the corresponding input index is:
46
+ # h = oh // scale_factor, w = ow // scale_factor, linear index = h * W + w.
47
+ device = features.device
48
+ # Create coordinate indices for output
49
+ h_idx = torch.div(torch.arange(out_H, device=device), scale_factor, rounding_mode='floor') # (out_H,)
50
+ w_idx = torch.div(torch.arange(out_W, device=device), scale_factor, rounding_mode='floor') # (out_W,)
51
+ # Form a 2D grid of base indices (shape: out_H x out_W)
52
+ h_idx = h_idx.unsqueeze(1).expand(out_H, out_W) # (out_H, out_W)
53
+ w_idx = w_idx.unsqueeze(0).expand(out_H, out_W) # (out_H, out_W)
54
+ base_idx = (h_idx * W + w_idx).view(-1) # (out_H*out_W,)
55
+
56
+ # Expand base_idx so that it can index the last dimension of patches:
57
+ # Desired shape for gathering: (N, group_size, num_channels, kernel_size*kernel_size, out_H*out_W)
58
+ base_idx = base_idx.view(1, 1, 1, 1, -1).expand(N, group_size, num_channels, kernel_size * kernel_size, -1)
59
+ # Gather patches corresponding to each output location
60
+ gathered_patches = torch.gather(patches, -1, base_idx)
61
+ # Reshape gathered patches to (N, group_size, num_channels, kernel_size*kernel_size, out_H, out_W)
62
+ gathered_patches = gathered_patches.view(N, group_size, num_channels, kernel_size * kernel_size, out_H, out_W)
63
+
64
+ # Reshape masks to separate groups.
65
+ # Expected mask shape: (N, kernel_size*kernel_size*group_size, out_H, out_W)
66
+ # Reshape to: (N, group_size, kernel_size*kernel_size, out_H, out_W)
67
+ masks = masks.view(N, group_size, kernel_size * kernel_size, out_H, out_W)
68
+ # For multiplication, add a channel dimension so that masks shape becomes
69
+ # (N, group_size, 1, kernel_size*kernel_size, out_H, out_W)
70
+ masks = masks.unsqueeze(2)
71
+ # Expand masks to match gathered_patches: (N, group_size, num_channels, kernel_size*kernel_size, out_H, out_W)
72
+ masks = masks.expand(-1, -1, num_channels, -1, -1, -1)
73
+
74
+ # Multiply patches with masks and sum over the kernel dimension.
75
+ # This yields the reassembled features for each output location.
76
+ out = (gathered_patches * masks).sum(dim=3) # shape: (N, group_size, num_channels, out_H, out_W)
77
+ # Reshape back to (N, C, out_H, out_W)
78
+ out = out.view(N, C, out_H, out_W)
79
+ return out
80
+
81
+
82
+ class CARAFE(nn.Module):
83
+ """
84
+ CARAFE: Content-Aware ReAssembly of Features
85
+
86
+ This PyTorch module implements the CARAFE upsampling operator in pure Python.
87
+ Given an input feature map and its corresponding reassembly masks, the module
88
+ reassembles features from local patches to produce a higher-resolution output.
89
+
90
+ Args:
91
+ kernel_size (int): Reassembly kernel size.
92
+ group_size (int): Group size for channel grouping (must divide number of channels).
93
+ scale_factor (int): Upsample ratio.
94
+ """
95
+ def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
96
+ super(CARAFE, self).__init__()
97
+ self.kernel_size = kernel_size
98
+ self.group_size = group_size
99
+ self.scale_factor = scale_factor
100
+
101
+ def forward(self, features: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
102
+ return carafe_forward(features, masks, self.kernel_size, self.group_size, self.scale_factor)
103
+
104
+
105
+ class CARAFEPack(nn.Module):
106
+ """
107
+ A unified package of the CARAFE upsampler that contains:
108
+ 1) A channel compressor.
109
+ 2) A content encoder that predicts reassembly masks.
110
+ 3) The CARAFE operator.
111
+
112
+ This is modeled after the official CARAFE package.
113
+
114
+ Args:
115
+ channels (int): Number of input feature channels.
116
+ scale_factor (int): Upsample ratio.
117
+ up_kernel (int): Kernel size for the CARAFE operator.
118
+ up_group (int): Group size for the CARAFE operator.
119
+ encoder_kernel (int): Kernel size of the content encoder.
120
+ encoder_dilation (int): Dilation rate for the content encoder.
121
+ compressed_channels (int): Output channels for the channel compressor.
122
+ """
123
+ def __init__(
124
+ self,
125
+ channels: int,
126
+ scale_factor: int,
127
+ up_kernel: int = 5,
128
+ up_group: int = 1,
129
+ encoder_kernel: int = 3,
130
+ encoder_dilation: int = 1,
131
+ compressed_channels: int = 64
132
+ ):
133
+ super(CARAFEPack, self).__init__()
134
+ self.channels = channels
135
+ self.scale_factor = scale_factor
136
+ self.up_kernel = up_kernel
137
+ self.up_group = up_group
138
+ self.encoder_kernel = encoder_kernel
139
+ self.encoder_dilation = encoder_dilation
140
+ self.compressed_channels = compressed_channels
141
+
142
+ # Compress input channels.
143
+ self.channel_compressor = nn.Conv2d(channels, compressed_channels, kernel_size=1)
144
+ # Predict reassembly masks.
145
+ self.content_encoder = nn.Conv2d(
146
+ compressed_channels,
147
+ up_kernel * up_kernel * up_group * scale_factor * scale_factor,
148
+ kernel_size=encoder_kernel,
149
+ padding=int((encoder_kernel - 1) * encoder_dilation / 2),
150
+ dilation=encoder_dilation
151
+ )
152
+ # Initialize weights (using Xavier for conv layers).
153
+ nn.init.xavier_uniform_(self.channel_compressor.weight)
154
+ nn.init.xavier_uniform_(self.content_encoder.weight)
155
+ if self.channel_compressor.bias is not None:
156
+ nn.init.constant_(self.channel_compressor.bias, 0)
157
+ if self.content_encoder.bias is not None:
158
+ nn.init.constant_(self.content_encoder.bias, 0)
159
+
160
+ def kernel_normalizer(self, mask: torch.Tensor) -> torch.Tensor:
161
+ """
162
+ Normalize and reshape the mask.
163
+ Applies pixel shuffle to upsample the predicted kernel weights and then
164
+ applies softmax normalization across the kernel dimension.
165
+
166
+ Args:
167
+ mask (Tensor): Predicted mask of shape (N, out_channels, H, W).
168
+
169
+ Returns:
170
+ Tensor: Normalized mask of shape (N, up_group * up_kernel^2, H*scale, W*scale).
171
+ """
172
+ # Pixel shuffle to rearrange and upsample the mask.
173
+ mask = F.pixel_shuffle(mask, self.scale_factor)
174
+ N, mask_c, H, W = mask.size()
175
+ # Determine the number of channels per kernel
176
+ mask_channel = mask_c // (self.up_kernel ** 2)
177
+ mask = mask.view(N, mask_channel, self.up_kernel ** 2, H, W)
178
+ mask = F.softmax(mask, dim=2)
179
+ mask = mask.view(N, mask_channel * self.up_kernel ** 2, H, W).contiguous()
180
+ return mask
181
+
182
+ def feature_reassemble(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
183
+ return carafe_forward(x, mask, self.up_kernel, self.up_group, self.scale_factor)
184
+
185
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
186
+ compressed_x = self.channel_compressor(x)
187
+ mask = self.content_encoder(compressed_x)
188
+ mask = self.kernel_normalizer(mask)
189
+ out = self.feature_reassemble(x, mask)
190
+ return out
191
+
192
+
193
+ # === Example Usage ===
194
+ if __name__ == '__main__':
195
+ # Create dummy input: batch size 2, 64 channels, 32x32 spatial resolution.
196
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # fall back to CPU when no GPU is available
+ x = torch.randn(2, 64, 32, 32, device=device)
197
+ # Define CARAFEPack with upsample ratio 2.
198
+ # For example, use kernel size 5, group size 1.
199
+ upsampler = CARAFEPack(channels=64, scale_factor=2, up_kernel=5, up_group=1).to(device)
200
+ # Get upsampled feature map.
201
+ out = upsampler(x)
202
+ print("Input shape: ", x.shape)
203
+ print("Output shape:", out.shape) # Expected shape: (2, 64, 64, 64)
models/utils/downsample.py ADDED
@@ -0,0 +1,239 @@
1
+ from torch import nn, Tensor
2
+
3
+ from typing import Union
4
+
5
+ from .blocks import DepthSeparableConv2d, conv1x1, conv3x3
6
+ from .utils import _init_weights
7
+
8
+
9
+ class ConvDownsample(nn.Module):
10
+ def __init__(
11
+ self,
12
+ in_channels: int,
13
+ out_channels: int,
14
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
15
+ activation: nn.Module = nn.ReLU(inplace=True),
16
+ groups: int = 1,
17
+ ) -> None:
18
+ super().__init__()
19
+ assert isinstance(groups, int) and groups > 0, f"Number of groups should be an integer greater than 0, but got {groups}."
20
+ assert in_channels % groups == 0, f"Number of input channels {in_channels} should be divisible by number of groups {groups}."
21
+ assert out_channels % groups == 0, f"Number of output channels {out_channels} should be divisible by number of groups {groups}."
22
+ self.grouped_conv = groups > 1
23
+
24
+ # conv1 is used for downsampling
25
+ # self.conv1 = nn.Conv2d(
26
+ # in_channels=in_channels,
27
+ # out_channels=in_channels,
28
+ # kernel_size=2,
29
+ # stride=2,
30
+ # padding=0,
31
+ # bias=not norm_layer,
32
+ # groups=groups,
33
+ # )
34
+ # if self.grouped_conv:
35
+ # self.conv1_1x1 = conv1x1(in_channels, in_channels, stride=1, bias=not norm_layer)
36
+ self.conv1 = nn.AvgPool2d(kernel_size=2, stride=2) # downsample by 2
37
+ if self.grouped_conv:
38
+ self.conv1_1x1 = nn.Identity()
39
+
40
+ self.norm1 = norm_layer(in_channels) if norm_layer else nn.Identity()
41
+ self.act1 = activation
42
+
43
+ self.conv2 = conv3x3(
44
+ in_channels=in_channels,
45
+ out_channels=in_channels,
46
+ stride=1,
47
+ groups=groups,
48
+ bias=not norm_layer,
49
+ )
50
+ if self.grouped_conv:
51
+ self.conv2_1x1 = conv1x1(in_channels, in_channels, stride=1, bias=not norm_layer)
52
+
53
+ self.norm2 = norm_layer(in_channels) if norm_layer else nn.Identity()
54
+ self.act2 = activation
55
+
56
+ self.conv3 = conv3x3(
57
+ in_channels=in_channels,
58
+ out_channels=out_channels,
59
+ stride=1,
60
+ groups=groups,
61
+ bias=not norm_layer,
62
+ )
63
+ if self.grouped_conv:
64
+ self.conv3_1x1 = conv1x1(out_channels, out_channels, stride=1, bias=not norm_layer)
65
+
66
+ self.norm3 = norm_layer(out_channels) if norm_layer else nn.Identity()
67
+ self.act3 = activation
68
+
69
+ self.downsample = nn.Sequential(
70
+ nn.AvgPool2d(kernel_size=2, stride=2), # make sure the spatial sizes match
71
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
72
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
73
+ )
74
+
75
+ self.apply(_init_weights)
76
+
77
+ def forward(self, x: Tensor) -> Tensor:
78
+ identity = x
79
+
80
+ # downsample
81
+ out = self.conv1(x)
82
+ out = self.conv1_1x1(out) if self.grouped_conv else out
83
+ out = self.norm1(out)
84
+ out = self.act1(out)
85
+
86
+ out = self.conv2(out)
87
+ out = self.conv2_1x1(out) if self.grouped_conv else out
88
+ out = self.norm2(out)
89
+ out = self.act2(out)
90
+
91
+ out = self.conv3(out)
92
+ out = self.conv3_1x1(out) if self.grouped_conv else out
93
+ out = self.norm3(out)
94
+
95
+ # shortcut
96
+ out += self.downsample(identity)
97
+ out = self.act3(out)
98
+ return out
99
+
100
+
101
+ class LightConvDownsample(nn.Module):
102
+ def __init__(
103
+ self,
104
+ in_channels: int,
105
+ out_channels: int,
106
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
107
+ activation: nn.Module = nn.ReLU(inplace=True),
108
+ ) -> None:
109
+ super().__init__()
110
+ self.conv1 = DepthSeparableConv2d(
111
+ in_channels=in_channels,
112
+ out_channels=in_channels,
113
+ kernel_size=2,
114
+ stride=2,
115
+ padding=0,
116
+ bias=not norm_layer,
117
+ )
118
+ self.norm1 = norm_layer(in_channels) if norm_layer else nn.Identity()
119
+ self.act1 = activation
120
+
121
+ self.conv2 = DepthSeparableConv2d(
122
+ in_channels=in_channels,
123
+ out_channels=out_channels,
124
+ kernel_size=3,
125
+ stride=1,
126
+ padding=1,
127
+ bias=not norm_layer,
128
+ )
129
+ self.norm2 = norm_layer(out_channels) if norm_layer else nn.Identity()
130
+ self.act2 = activation
131
+
132
+ self.conv3 = DepthSeparableConv2d(
133
+ in_channels=out_channels,
134
+ out_channels=out_channels,
135
+ kernel_size=3,
136
+ stride=1,
137
+ padding=1,
138
+ bias=not norm_layer,
139
+ )
140
+ self.norm3 = norm_layer(out_channels) if norm_layer else nn.Identity()
141
+ self.act3 = activation
142
+
143
+ self.downsample = nn.Sequential(
144
+ nn.AvgPool2d(kernel_size=2, stride=2), # make sure the spatial sizes match
145
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
146
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
147
+ )
148
+
149
+ self.apply(_init_weights)
150
+
151
+ def forward(self, x: Tensor) -> Tensor:
152
+ identity = x
153
+
154
+ # downsample
155
+ out = self.conv1(x)
156
+ out = self.norm1(out)
157
+ out = self.act1(out)
158
+
159
+ # refine 1
160
+ out = self.conv2(out)
161
+ out = self.norm2(out)
162
+ out = self.act2(out)
163
+
164
+ # refine 2
165
+ out = self.conv3(out)
166
+ out = self.norm3(out)
167
+
168
+ # shortcut
169
+ out += self.downsample(identity)
170
+ out = self.act3(out)
171
+ return out
172
+
173
+
174
+ class LighterConvDownsample(nn.Module):
175
+ def __init__(
176
+ self,
177
+ in_channels: int,
178
+ out_channels: int,
179
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
180
+ activation: nn.Module = nn.ReLU(inplace=True),
181
+ ) -> None:
182
+ super().__init__()
183
+ self.conv1 = DepthSeparableConv2d(
184
+ in_channels=in_channels,
185
+ out_channels=in_channels,
186
+ kernel_size=2,
187
+ stride=2,
188
+ padding=0,
189
+ bias=not norm_layer,
190
+ )
191
+ self.norm1 = norm_layer(in_channels) if norm_layer else nn.Identity()
192
+ self.act1 = activation
193
+
194
+ self.conv2 = conv3x3(
195
+ in_channels=in_channels,
196
+ out_channels=in_channels,
197
+ stride=1,
198
+ groups=in_channels,
199
+ bias=not norm_layer,
200
+ )
201
+ self.norm2 = norm_layer(in_channels) if norm_layer else nn.Identity()
202
+ self.act2 = activation
203
+
204
+ self.conv3 = conv1x1(
205
+ in_channels=in_channels,
206
+ out_channels=out_channels,
207
+ stride=1,
208
+ bias=not norm_layer,
209
+ )
210
+ self.norm3 = norm_layer(out_channels) if norm_layer else nn.Identity()
211
+ self.act3 = activation
212
+
213
+ self.downsample = nn.Sequential(
214
+ nn.AvgPool2d(kernel_size=2, stride=2), # make sure the spatial sizes match
215
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
216
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
217
+ )
218
+ self.apply(_init_weights)
+
219
+ def forward(self, x: Tensor) -> Tensor:
220
+ identity = x
221
+
222
+ # downsample
223
+ out = self.conv1(x)
224
+ out = self.norm1(out)
225
+ out = self.act1(out)
226
+
227
+ # refine, depthwise conv
228
+ out = self.conv2(out)
229
+ out = self.norm2(out)
230
+ out = self.act2(out)
231
+
232
+ # refine, pointwise conv
233
+ out = self.conv3(out)
234
+ out = self.norm3(out)
235
+
236
+ # shortcut
237
+ out += self.downsample(identity)
238
+ out = self.act3(out)
239
+ return out
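A hedged shape check for the three downsample blocks above, assuming the conv1x1/conv3x3/DepthSeparableConv2d helpers imported from .blocks behave like their standard-convolution counterparts; each block should halve the spatial resolution while mapping 64 to 128 channels.

import torch

x = torch.randn(2, 64, 32, 32)
for block_cls in (ConvDownsample, LightConvDownsample, LighterConvDownsample):
    block = block_cls(in_channels=64, out_channels=128)
    print(block_cls.__name__, block(x).shape)   # expected: torch.Size([2, 128, 16, 16])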
models/utils/multi_scale.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ from typing import List
4
+ from einops import rearrange
5
+
6
+ from .blocks import conv3x3, conv1x1, Conv2dLayerNorm, _init_weights
7
+
8
+
9
+ class MultiScale(nn.Module):
10
+ def __init__(
11
+ self,
12
+ channels: int,
13
+ scales: List[int],
14
+ heads: int = 8,
15
+ groups: int = 1,
16
+ mlp_ratio: float = 4.0,
17
+ ) -> None:
18
+ super().__init__()
19
+ assert channels > 0, "channels should be a positive integer"
20
+ assert isinstance(scales, (list, tuple)) and len(scales) > 0 and all([scale > 0 for scale in scales]), "scales should be a list or tuple of positive integers"
21
+ assert heads > 0 and channels % heads == 0, "heads should be a positive integer and channels should be divisible by heads"
22
+ assert groups > 0 and channels % groups == 0, "groups should be a positive integer and channels should be divisible by groups"
23
+ scales = sorted(scales)
24
+ self.scales = scales
25
+ self.num_scales = len(scales) + 1 # +1 for the original feature map
26
+ self.heads = heads
27
+ self.groups = groups
28
+
29
+ # modules that generate multi-scale feature maps
30
+ self.scale_0 = nn.Sequential(
31
+ conv1x1(channels, channels, stride=1, bias=False),
32
+ Conv2dLayerNorm(channels),
33
+ nn.GELU(),
34
+ )
35
+ for scale in scales:
36
+ setattr(self, f"conv_{scale}", nn.Sequential(
37
+ conv3x3(
38
+ in_channels=channels,
39
+ out_channels=channels,
40
+ stride=1,
41
+ groups=groups,
42
+ dilation=scale,
43
+ bias=False,
44
+ ),
45
+ conv1x1(channels, channels, stride=1, bias=False) if groups > 1 else nn.Identity(),
46
+ Conv2dLayerNorm(channels),
47
+ nn.GELU(),
48
+ ))
49
+
50
+ # modules that fuse multi-scale feature maps
51
+ self.norm_attn = Conv2dLayerNorm(channels)
52
+ self.pos_embed = nn.Parameter(torch.randn(1, self.num_scales + 1, channels, 1, 1) / channels ** 0.5)
53
+ self.to_q = conv1x1(channels, channels, stride=1, bias=False)
54
+ self.to_k = conv1x1(channels, channels, stride=1, bias=False)
55
+ self.to_v = conv1x1(channels, channels, stride=1, bias=False)
56
+
57
+ self.scale = (channels // heads) ** -0.5
58
+
59
+ self.attend = nn.Softmax(dim=-1)
60
+
61
+ self.to_out = conv1x1(channels, channels, stride=1)
62
+
63
+ # modules that refine multi-scale feature maps
64
+ self.norm_mlp = Conv2dLayerNorm(channels)
65
+ self.mlp = nn.Sequential(
66
+ conv1x1(channels, int(channels * mlp_ratio), stride=1),
67
+ nn.GELU(),
68
+ conv1x1(int(channels * mlp_ratio), channels, stride=1),
69
+ )
70
+
71
+ self.apply(_init_weights)
72
+
73
+ def _forward_attn(self, x: Tensor) -> Tensor:
74
+ assert len(x.shape) == 4, f"Expected input to have shape (B, C, H, W), but got {x.shape}"
75
+ x = [self.scale_0(x)] + [getattr(self, f"conv_{scale}")(x) for scale in self.scales]
76
+
77
+ x = torch.stack(x, dim=1) # (B, S, C, H, W)
78
+ x = torch.cat([x.mean(dim=1, keepdim=True), x], dim=1) # (B, S+1, C, H, W)
79
+ x = x + self.pos_embed # (B, S+1, C, H, W)
80
+
81
+ x = rearrange(x, "B S C H W -> (B S) C H W") # (B*(S+1), C, H, W)
82
+ x = self.norm_attn(x) # (B*(S+1), C, H, W)
83
+ x = rearrange(x, "(B S) C H W -> B S C H W", S=self.num_scales + 1) # (B, S+1, C, H, W)
84
+
85
+ q = self.to_q(x[:, 0]) # (B, C, H, W)
86
+ k = self.to_k(rearrange(x, "B S C H W -> (B S) C H W"))
87
+ v = self.to_v(rearrange(x, "B S C H W -> (B S) C H W"))
88
+
89
+ q = rearrange(q, "B (h d) H W -> B h H W 1 d", h=self.heads)
90
+ k = rearrange(k, "(B S) (h d) H W -> B h H W S d", S=self.num_scales + 1, h=self.heads)
91
+ v = rearrange(v, "(B S) (h d) H W -> B h H W S d", S=self.num_scales + 1, h=self.heads)
92
+
93
+ attn = q @ k.transpose(-2, -1) * self.scale # (B, h, H, W, 1, S+1)
94
+ attn = self.attend(attn) # (B, h, H, W, 1, S+1)
95
+ out = attn @ v # (B, h, H, W, 1, d)
96
+
97
+ out = rearrange(out, "B h H W 1 d -> B (h d) H W") # (B, C, H, W)
98
+
99
+ out = self.to_out(out) # (B, C, H, W)
100
+ return out
101
+
102
+ def _forward_mlp(self, x: Tensor) -> Tensor:
103
+ assert len(x.shape) == 4, f"Expected input to have shape (B, C, H, W), but got {x.shape}"
104
+ x = self.norm_mlp(x)
105
+ x = self.mlp(x)
106
+ return x
107
+
108
+ def forward(self, x: Tensor) -> Tensor:
109
+ x = x + self._forward_attn(x)
110
+ x = x + self._forward_mlp(x)
111
+ return x
112
+
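A hedged sanity check for MultiScale, assuming Conv2dLayerNorm and the conv helpers from .blocks are importable and einops is installed; the block fuses the dilated branches with cross-scale attention and is shape-preserving.

import torch

msa = MultiScale(channels=64, scales=[2, 3], heads=8, groups=1)
x = torch.randn(2, 64, 16, 16)
print(msa(x).shape)   # expected: torch.Size([2, 64, 16, 16])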
models/utils/refine.py ADDED
@@ -0,0 +1,103 @@
1
+ from torch import nn, Tensor
2
+ from typing import Union
3
+
4
+ from .utils import _init_weights
5
+ from .blocks import BasicBlock, LightBasicBlock, conv1x1, conv3x3
6
+
7
+
8
+ class ConvRefine(nn.Module):
9
+ def __init__(
10
+ self,
11
+ in_channels: int,
12
+ out_channels: int,
13
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
14
+ activation: nn.Module = nn.ReLU(inplace=True),
15
+ groups: int = 1,
16
+ ) -> None:
17
+ super().__init__()
18
+ self.refine = BasicBlock(
19
+ in_channels=in_channels,
20
+ out_channels=out_channels,
21
+ norm_layer=norm_layer,
22
+ activation=activation,
23
+ groups=groups,
24
+ )
25
+ self.apply(_init_weights)
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return self.refine(x)
29
+
30
+
31
+ class LightConvRefine(nn.Module):
32
+ def __init__(
33
+ self,
34
+ in_channels: int,
35
+ out_channels: int,
36
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
37
+ activation: nn.Module = nn.ReLU(inplace=True),
38
+ ) -> None:
39
+ super().__init__()
40
+ self.refine = LightBasicBlock(
41
+ in_channels=in_channels,
42
+ out_channels=out_channels,
43
+ norm_layer=norm_layer,
44
+ activation=activation,
45
+ )
46
+ self.apply(_init_weights)
47
+
48
+ def forward(self, x: Tensor) -> Tensor:
49
+ return self.refine(x)
50
+
51
+
52
+ class LighterConvRefine(nn.Module):
53
+ def __init__(
54
+ self,
55
+ in_channels: int,
56
+ out_channels: int,
57
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
58
+ activation: nn.Module = nn.ReLU(inplace=True),
59
+ ) -> None:
60
+ super().__init__()
61
+ # depthwise separable convolution
62
+ self.conv1 = conv3x3(
63
+ in_channels=in_channels,
64
+ out_channels=in_channels,
65
+ stride=1,
66
+ groups=in_channels,
67
+ bias=not norm_layer,
68
+ )
69
+ self.norm1 = norm_layer(in_channels) if norm_layer else nn.Identity()
70
+ self.act1 = activation
71
+
72
+ self.conv2 = conv1x1(
73
+ in_channels=in_channels,
74
+ out_channels=out_channels,
75
+ stride=1,
76
+ bias=not norm_layer,
77
+ )
78
+ self.norm2 = norm_layer(out_channels) if norm_layer else nn.Identity()
79
+ self.act2 = activation
80
+
81
+ if in_channels != out_channels:
82
+ self.downsample = nn.Sequential(
83
+ conv1x1(in_channels, out_channels, stride=1, bias=not norm_layer),
84
+ norm_layer(out_channels) if norm_layer else nn.Identity(),
85
+ )
86
+ else:
87
+ self.downsample = nn.Identity()
88
+
89
+ self.apply(_init_weights)
90
+
91
+ def forward(self, x: Tensor) -> Tensor:
92
+ identity = x
93
+
94
+ out = self.conv1(x)
95
+ out = self.norm1(out)
96
+ out = self.act1(out)
97
+
98
+ out = self.conv2(out)
99
+ out = self.norm2(out)
100
+
101
+ out += self.downsample(identity)
102
+ out = self.act2(out)
103
+ return out
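A hedged shape check for the refine blocks, assuming BasicBlock and LightBasicBlock from .blocks follow the usual stride-1 residual-block behaviour; spatial size is preserved while channels are remapped.

import torch

x = torch.randn(2, 64, 32, 32)
for cls in (ConvRefine, LightConvRefine, LighterConvRefine):
    block = cls(in_channels=64, out_channels=96)
    print(cls.__name__, block(x).shape)   # expected: torch.Size([2, 96, 32, 32])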
models/utils/upsample.py ADDED
@@ -0,0 +1,118 @@
1
+ from torch import nn, Tensor
2
+ from torch.nn import functional as F
3
+
4
+ from typing import Union
5
+ from functools import partial
6
+
7
+ from .utils import _init_weights
8
+ from .refine import ConvRefine, LightConvRefine, LighterConvRefine
9
+
10
+
11
+ class ConvUpsample(nn.Module):
12
+ def __init__(
13
+ self,
14
+ in_channels: int,
15
+ out_channels: int,
16
+ scale_factor: int = 2,
17
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
18
+ activation: nn.Module = nn.ReLU(inplace=True),
19
+ groups: int = 1,
20
+ ) -> None:
21
+ super().__init__()
22
+ assert scale_factor >= 1, f"Scale factor should be greater than or equal to 1, but got {scale_factor}"
23
+ self.scale_factor = scale_factor
24
+ self.upsample = partial(
25
+ F.interpolate,
26
+ scale_factor=scale_factor,
27
+ mode="bilinear",
28
+ align_corners=False,
29
+ recompute_scale_factor=False,
30
+ antialias=False,
31
+ ) if scale_factor > 1 else nn.Identity()
32
+
33
+ self.refine = ConvRefine(
34
+ in_channels=in_channels,
35
+ out_channels=out_channels,
36
+ norm_layer=norm_layer,
37
+ activation=activation,
38
+ groups=groups,
39
+ )
40
+
41
+ self.apply(_init_weights)
42
+
43
+ def forward(self, x: Tensor) -> Tensor:
44
+ x = self.upsample(x)
45
+ x = self.refine(x)
46
+ return x
47
+
48
+
49
+ class LightConvUpsample(nn.Module):
50
+ def __init__(
51
+ self,
52
+ in_channels: int,
53
+ out_channels: int,
54
+ scale_factor: int = 2,
55
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
56
+ activation: nn.Module = nn.ReLU(inplace=True),
57
+ ) -> None:
58
+ super().__init__()
59
+ assert scale_factor >= 1, f"Scale factor should be greater than or equal to 1, but got {scale_factor}"
60
+ self.scale_factor = scale_factor
61
+ self.upsample = partial(
62
+ F.interpolate,
63
+ scale_factor=scale_factor,
64
+ mode="bilinear",
65
+ align_corners=False,
66
+ recompute_scale_factor=False,
67
+ antialias=False,
68
+ ) if scale_factor > 1 else nn.Identity()
69
+
70
+ self.refine = LightConvRefine(
71
+ in_channels=in_channels,
72
+ out_channels=out_channels,
73
+ norm_layer=norm_layer,
74
+ activation=activation,
75
+ )
76
+
77
+ self.apply(_init_weights)
78
+
79
+ def forward(self, x: Tensor) -> Tensor:
80
+ x = self.upsample(x)
81
+ x = self.refine(x)
82
+ return x
83
+
84
+
85
+ class LighterConvUpsample(nn.Module):
86
+ def __init__(
87
+ self,
88
+ in_channels: int,
89
+ out_channels: int,
90
+ scale_factor: int = 2,
91
+ norm_layer: Union[nn.BatchNorm2d, nn.GroupNorm, None] = nn.BatchNorm2d,
92
+ activation: nn.Module = nn.ReLU(inplace=True),
93
+ ) -> None:
94
+ super().__init__()
95
+ assert scale_factor >= 1, f"Scale factor should be greater than or equal to 1, but got {scale_factor}"
96
+ self.scale_factor = scale_factor
97
+ self.upsample = partial(
98
+ F.interpolate,
99
+ scale_factor=scale_factor,
100
+ mode="bilinear",
101
+ align_corners=False,
102
+ recompute_scale_factor=False,
103
+ antialias=False,
104
+ ) if scale_factor > 1 else nn.Identity()
105
+
106
+ self.refine = LighterConvRefine(
107
+ in_channels=in_channels,
108
+ out_channels=out_channels,
109
+ norm_layer=norm_layer,
110
+ activation=activation,
111
+ )
112
+
113
+ self.apply(_init_weights)
114
+
115
+ def forward(self, x: Tensor) -> Tensor:
116
+ x = self.upsample(x)
117
+ x = self.refine(x)
118
+ return x
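A hedged shape check for the upsample blocks: bilinear interpolation by scale_factor followed by the matching refine block (same .blocks import assumptions as above).

import torch

x = torch.randn(2, 128, 16, 16)
up = ConvUpsample(in_channels=128, out_channels=64, scale_factor=2)
print(up(x).shape)   # expected: torch.Size([2, 64, 32, 32])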
models/utils/utils.py ADDED
@@ -0,0 +1,77 @@
1
+ import torch
2
+ from torch import nn, Tensor
3
+ import torch.nn.functional as F
4
+ from typing import Tuple, Any, Optional, Union
5
+ from types import FunctionType
6
+ from itertools import repeat
7
+ from collections.abc import Iterable
8
+
9
+
10
+ def _log_api_usage_once(obj: Any) -> None:
11
+
12
+ """
13
+ Logs API usage (module and name) within an organization.
14
+ In a large ecosystem, it's often useful to track the PyTorch and
15
+ TorchVision APIs usage. This API provides the similar functionality to the
16
+ logging module in the Python stdlib. It can be used for debugging purpose
17
+ to log which methods are used and by default it is inactive, unless the user
18
+ manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
19
+ Please note it is triggered only once for the same API call within a process.
20
+ It does not collect any data from open-source users since it is no-op by default.
21
+ For more information, please refer to
22
+ * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
23
+ * Logging policy: https://github.com/pytorch/vision/issues/5052;
24
+
25
+ Args:
26
+ obj (class instance or method): an object to extract info from.
27
+ """
28
+ module = obj.__module__
29
+ if not module.startswith("torchvision"):
30
+ module = f"torchvision.internal.{module}"
31
+ name = obj.__class__.__name__
32
+ if isinstance(obj, FunctionType):
33
+ name = obj.__name__
34
+ torch._C._log_api_usage_once(f"{module}.{name}")
35
+
36
+
37
+ def _make_ntuple(x: Any, n: int) -> Tuple[Any, ...]:
38
+ """
39
+ Make n-tuple from input x. If x is an iterable, then we just convert it to tuple.
40
+ Otherwise, we will make a tuple of length n, all with value of x.
41
+ reference: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/utils.py#L8
42
+
43
+ Args:
44
+ x (Any): input value
45
+ n (int): length of the resulting tuple
46
+ """
47
+ if isinstance(x, Iterable):
48
+ return tuple(x)
49
+ return tuple(repeat(x, n))
50
+
51
+
52
+ def _init_weights(model: nn.Module) -> None:
53
+ for m in model.modules():
54
+ if isinstance(m, nn.Conv2d):
55
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
56
+ if m.bias is not None:
57
+ nn.init.constant_(m.bias, 0.)
58
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm, nn.LayerNorm)):
59
+ nn.init.constant_(m.weight, 1.)
60
+ if m.bias is not None:
61
+ nn.init.constant_(m.bias, 0.)
62
+ elif isinstance(m, nn.Linear):
63
+ nn.init.normal_(m.weight, std=0.01)
64
+ if m.bias is not None:
65
+ nn.init.constant_(m.bias, 0.)
66
+
67
+
68
+ def interpolate_pos_embed(pos_embed: Tensor, size: Optional[Union[int, Tuple[int, int]]] = None, scale_factor: Optional[float] = None) -> Tensor:
69
+ assert len(pos_embed.shape) == 3, f"Positional embedding should be 3D tensor (C, H, W), but got {pos_embed.shape}."
70
+ return F.interpolate(
71
+ pos_embed.unsqueeze(0),
72
+ size=size,
73
+ scale_factor=scale_factor,
74
+ mode="bicubic",
75
+ align_corners=False,
76
+ antialias=True,
77
+ ).squeeze(0)
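A hedged example for interpolate_pos_embed; the 768x14x14 grid is an illustrative ViT-B/16-style positional embedding, not a checkpoint shipped with this repository.

import torch

pos = torch.randn(768, 14, 14)                       # (C, H, W) positional grid
resized = interpolate_pos_embed(pos, size=(24, 24))  # bicubic, antialiased resize
print(resized.shape)                                 # torch.Size([768, 24, 24])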