# models/utils/multi_scale.py
import torch
from torch import nn, Tensor
from typing import List
from einops import rearrange
from .blocks import conv3x3, conv1x1, Conv2dLayerNorm, _init_weights
class MultiScale(nn.Module):
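    """Multi-scale feature fusion block.

    One feature map is produced per dilation rate in ``scales`` (dilated 3x3
    convolutions), plus a 1x1-projected copy of the input. The maps are fused
    per spatial location with multi-head attention: the scale-averaged map
    forms the query, while the full set (average included) supplies keys and
    values. A pre-norm 1x1-conv MLP then refines the fused features; both the
    attention and MLP stages are residual.
    """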
def __init__(
self,
channels: int,
scales: List[int],
heads: int = 8,
groups: int = 1,
mlp_ratio: float = 4.0,
) -> None:
super().__init__()
assert channels > 0, "channels should be a positive integer"
        assert isinstance(scales, (list, tuple)) and len(scales) > 0 and all(scale > 0 for scale in scales), "scales should be a list or tuple of positive integers"
assert heads > 0 and channels % heads == 0, "heads should be a positive integer and channels should be divisible by heads"
assert groups > 0 and channels % groups == 0, "groups should be a positive integer and channels should be divisible by groups"
scales = sorted(scales)
self.scales = scales
self.num_scales = len(scales) + 1 # +1 for the original feature map
self.heads = heads
self.groups = groups
# modules that generate multi-scale feature maps
self.scale_0 = nn.Sequential(
conv1x1(channels, channels, stride=1, bias=False),
Conv2dLayerNorm(channels),
nn.GELU(),
)
for scale in scales:
setattr(self, f"conv_{scale}", nn.Sequential(
conv3x3(
in_channels=channels,
out_channels=channels,
stride=1,
groups=groups,
dilation=scale,
bias=False,
),
conv1x1(channels, channels, stride=1, bias=False) if groups > 1 else nn.Identity(),
Conv2dLayerNorm(channels),
nn.GELU(),
))
# modules that fuse multi-scale feature maps
self.norm_attn = Conv2dLayerNorm(channels)
self.pos_embed = nn.Parameter(torch.randn(1, self.num_scales + 1, channels, 1, 1) / channels ** 0.5)
self.to_q = conv1x1(channels, channels, stride=1, bias=False)
self.to_k = conv1x1(channels, channels, stride=1, bias=False)
self.to_v = conv1x1(channels, channels, stride=1, bias=False)
self.scale = (channels // heads) ** -0.5
self.attend = nn.Softmax(dim=-1)
self.to_out = conv1x1(channels, channels, stride=1)
# modules that refine multi-scale feature maps
self.norm_mlp = Conv2dLayerNorm(channels)
        self.mlp = nn.Sequential(
            conv1x1(channels, int(channels * mlp_ratio), stride=1),
            nn.GELU(),
            conv1x1(int(channels * mlp_ratio), channels, stride=1),
        )
self.apply(_init_weights)
def _forward_attn(self, x: Tensor) -> Tensor:
assert len(x.shape) == 4, f"Expected input to have shape (B, C, H, W), but got {x.shape}"
x = [self.scale_0(x)] + [getattr(self, f"conv_{scale}")(x) for scale in self.scales]
x = torch.stack(x, dim=1) # (B, S, C, H, W)
x = torch.cat([x.mean(dim=1, keepdim=True), x], dim=1) # (B, S+1, C, H, W)
x = x + self.pos_embed # (B, S+1, C, H, W)
x = rearrange(x, "B S C H W -> (B S) C H W") # (B*(S+1), C, H, W)
x = self.norm_attn(x) # (B*(S+1), C, H, W)
x = rearrange(x, "(B S) C H W -> B S C H W", S=self.num_scales + 1) # (B, S+1, C, H, W)
q = self.to_q(x[:, 0]) # (B, C, H, W)
k = self.to_k(rearrange(x, "B S C H W -> (B S) C H W"))
v = self.to_v(rearrange(x, "B S C H W -> (B S) C H W"))
q = rearrange(q, "B (h d) H W -> B h H W 1 d", h=self.heads)
k = rearrange(k, "(B S) (h d) H W -> B h H W S d", S=self.num_scales + 1, h=self.heads)
v = rearrange(v, "(B S) (h d) H W -> B h H W S d", S=self.num_scales + 1, h=self.heads)
attn = q @ k.transpose(-2, -1) * self.scale # (B, h, H, W, 1, S+1)
attn = self.attend(attn) # (B, h, H, W, 1, S+1)
out = attn @ v # (B, h, H, W, 1, d)
out = rearrange(out, "B h H W 1 d -> B (h d) H W") # (B, C, H, W)
out = self.to_out(out) # (B, C, H, W)
return out
def _forward_mlp(self, x: Tensor) -> Tensor:
assert len(x.shape) == 4, f"Expected input to have shape (B, C, H, W), but got {x.shape}"
x = self.norm_mlp(x)
x = self.mlp(x)
return x
def forward(self, x: Tensor) -> Tensor:
x = x + self._forward_attn(x)
x = x + self._forward_mlp(x)
return x
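
if __name__ == "__main__":
    # Minimal smoke test (illustrative sketch, not part of the module's API).
    # Assumes the .blocks helpers (conv3x3 / conv1x1 / Conv2dLayerNorm) preserve
    # spatial resolution, as the stacking in _forward_attn requires; the channel
    # count must be divisible by both `heads` and `groups`.
    block = MultiScale(channels=64, scales=[1, 2, 3], heads=8, groups=1)
    dummy = torch.randn(2, 64, 32, 32)
    out = block(dummy)
    assert out.shape == dummy.shape  # residual design keeps (B, C, H, W)
    print(out.shape)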