Spaces:

jev-aleks
/

SceneDINO

Running on Zero

App Files Files Community

SceneDINO/scenedino/models/backbones/dino/downsampler.py

jev-aleks

scenedino init

9e15541 15 days ago

raw

history blame contribute delete

3.76 kB

	import torch
	import torch.nn.functional as F
	import torchvision


	class BilinearDownsampler(torch.nn.Module):
	    """Reduce the spatial size of a feature tensor by bilinear interpolation.

	    The two spatial axes of the input are each divided by the matching entry
	    of ``patch_size``, so every ``patch_size`` window yields one output cell.
	    """

	    def __init__(
	        self,
	        patch_size,
	    ):
	        """Store the patch size as a ``(height, width)`` pair.

	        Args:
	            patch_size: Either a single int (used for both axes) or a pair of
	                ints ``(patch_h, patch_w)``. Tuples are stored as given;
	                other iterables such as lists are converted to a tuple.

	        Raises:
	            TypeError: If ``patch_size`` is neither an int nor a pair of ints.
	        """
	        super().__init__()
	        if isinstance(patch_size, int):
	            self.patch_size = (patch_size, patch_size)
	        elif isinstance(patch_size, tuple):
	            self.patch_size = patch_size
	        else:
	            # Anything else (e.g. a list read from a config file) used to fall
	            # through and leave ``self.patch_size`` undefined, which only
	            # surfaced later as an AttributeError inside ``forward``.
	            try:
	                pair = tuple(patch_size)
	            except TypeError:
	                pair = None
	            if (
	                pair is None
	                or len(pair) != 2
	                or not all(isinstance(size, int) for size in pair)
	            ):
	                raise TypeError(
	                    "patch_size must be an int or a pair of ints, "
	                    f"got {patch_size!r}"
	                )
	            self.patch_size = pair

	    def forward(self, x, mode):
	        """Downsample ``x`` along its two spatial axes.

	        Args:
	            x: 6-D tensor unpacked as ``(n, v, h, w, k, c)``; ``h`` and ``w``
	                must be multiples of the respective patch size.
	            mode: Unused here; accepted so the call signature matches
	                ``PatchSalienceDownsampler.forward``.

	        Returns:
	            Tensor of shape ``(n, v, h // patch_h, w // patch_w, k, c)``, with
	            the two downsampled axes dropped when their size is 1.

	        Raises:
	            AssertionError: If ``h`` or ``w`` is not divisible by the patch size.
	        """
	        n, v, h, w, _, c = x.shape
	        patch_h, patch_w = self.patch_size[0], self.patch_size[1]

	        assert h % patch_h == 0, (
	            f"height {h} is not divisible by patch height {patch_h}"
	        )
	        assert w % patch_w == 0, (
	            f"width {w} is not divisible by patch width {patch_w}"
	        )
	        target_h, target_w = h // patch_h, w // patch_w

	        # F.interpolate expects (batch, channels, H, W): move the spatial axes
	        # last and fold n, v and k into a single batch axis.
	        x = x.permute(0, 1, 4, 5, 2, 3).flatten(0, 2)
	        x = F.interpolate(x, size=(target_h, target_w), mode="bilinear")

	        # Undo the folding and restore the (n, v, H, W, k, c) axis order.
	        x = x.reshape(n, v, -1, c, target_h, target_w).permute(0, 1, 4, 5, 2, 3)

	        # Only size-1 axes are removed, i.e. when the input was a single patch.
	        return x.squeeze((2, 3))


	class PatchSalienceDownsampler(torch.nn.Module):
	    """Pool each patch into one feature vector using learned salience weights.

	    A 1x1 convolution scores every position of a patch, the scores are scaled
	    and shifted by per-position parameters, and a softmax over the patch turns
	    them into weights for a weighted sum of the patch's features.
	    """

	    def __init__(
	        self,
	        channels,
	        patch_size,
	        normalize_features,
	    ):
	        """Create the salience convolution and the per-position parameters.

	        Args:
	            channels: Number of input feature channels fed to the 1x1 conv.
	            patch_size: Either a single int (used for both axes) or a pair of
	                ints ``(patch_h, patch_w)``. Tuples are stored as given;
	                other iterables such as lists are converted to a tuple.
	            normalize_features: If truthy, pooled features are L2-normalized
	                along the channel axis.

	        Raises:
	            TypeError: If ``patch_size`` is neither an int nor a pair of ints.
	        """
	        super().__init__()

	        if isinstance(patch_size, int):
	            self.patch_size = (patch_size, patch_size)
	        elif isinstance(patch_size, tuple):
	            self.patch_size = patch_size
	        else:
	            # Anything else (e.g. a list read from a config file) used to fall
	            # through, so the parameter creation below failed with a
	            # misleading AttributeError on ``self.patch_size``.
	            try:
	                pair = tuple(patch_size)
	            except TypeError:
	                pair = None
	            if (
	                pair is None
	                or len(pair) != 2
	                or not all(isinstance(size, int) for size in pair)
	            ):
	                raise TypeError(
	                    "patch_size must be an int or a pair of ints, "
	                    f"got {patch_size!r}"
	                )
	            self.patch_size = pair

	        self.conv = torch.nn.Conv2d(channels, 1, kernel_size=1)
	        self.patch_weight = torch.nn.Parameter(torch.ones(self.patch_size))
	        self.patch_bias = torch.nn.Parameter(torch.zeros(self.patch_size))

	        self.normalize_features = normalize_features

	        torch.nn.init.kaiming_normal_(self.conv.weight, a=0, mode="fan_in")
	        torch.nn.init.zeros_(self.conv.bias)
	        # Start close to an identity scale and a zero shift.
	        torch.nn.init.normal_(self.patch_weight, mean=1.0, std=0.01)
	        torch.nn.init.normal_(self.patch_bias, mean=0.0, std=0.01)

	    def forward(self, x, mode):
	        """Pool ``x`` either as pre-cut patches or as a full feature map.

	        Args:
	            x: For ``mode == "patch"``, a tensor accepted by
	                ``forward_patches``. For ``mode == "image"``, a 6-D tensor
	                unpacked as ``(n, v, h, w, 1, c)`` whose ``h`` and ``w`` are
	                multiples of the patch size.
	            mode: ``"patch"`` or ``"image"``.

	        Returns:
	            For ``"patch"``, the tuple returned by ``forward_patches``. For
	            ``"image"``, a tuple ``(features, salience_map, weight_map,
	            patch_weight_bias)`` where ``features`` has shape
	            ``(n, v, h // patch_h, w // patch_w, 1, c)`` and both maps have
	            shape ``(n, v, h, w, 1, 1)``. ``None`` for any other mode.

	        Raises:
	            AssertionError: In ``"image"`` mode, if ``h`` or ``w`` is not
	                divisible by the patch size.
	        """
	        if mode == "patch":
	            return self.forward_patches(x)

	        if mode == "image":
	            n, v, h, w, _, c = x.shape
	            patch_h, patch_w = self.patch_size[0], self.patch_size[1]

	            # Fail with a readable message instead of an opaque reshape error.
	            assert h % patch_h == 0, (
	                f"height {h} is not divisible by patch height {patch_h}"
	            )
	            assert w % patch_w == 0, (
	                f"width {w} is not divisible by patch width {patch_w}"
	            )
	            no_patches_h, no_patches_w = h // patch_h, w // patch_w

	            # Cut the image into a grid of patches and fold the view axis and
	            # the grid axes into one "patch index" axis.
	            patches = x.reshape(
	                n, v, no_patches_h, patch_h, no_patches_w, patch_w, 1, c
	            )
	            patches = patches.swapaxes(3, 4).flatten(1, 3)

	            patched_result, salience_map, weight_map, patch_weight_bias = (
	                self.forward_patches(patches)
	            )
	            patched_result = patched_result.reshape(
	                n, v, no_patches_h, no_patches_w, 1, c
	            )

	            # Reassemble the per-patch maps into full-resolution maps by
	            # reversing the patch cutting above.
	            grid_shape = (n, v, no_patches_h, no_patches_w, patch_h, patch_w, 1, 1)
	            image_shape = (n, v, h, w, 1, 1)
	            salience_map = (
	                salience_map.reshape(grid_shape).swapaxes(3, 4).reshape(image_shape)
	            )
	            weight_map = (
	                weight_map.reshape(grid_shape).swapaxes(3, 4).reshape(image_shape)
	            )

	            return patched_result, salience_map, weight_map, patch_weight_bias

	        # Unknown modes yield None rather than raising.
	        return None

	    def forward_patches(self, x):
	        """Pool every patch of ``x`` into a single feature vector.

	        Args:
	            x: 6-D tensor unpacked as ``(n, p, patch_h, patch_w, 1, c)``.

	        Returns:
	            A tuple ``(features, salience_map, weight_map, patch_weight_bias)``:
	            ``features`` of shape ``(n, p, 1, c)``; the raw conv scores and the
	            softmax weights, both of shape ``(n, p, patch_h, patch_w, 1, 1)``;
	            and ``patch_weight`` and ``patch_bias`` concatenated along dim 1.
	        """
	        n, p, patch_h, patch_w, _, c = x.shape
	        # Conv2d expects channels first: (n * p, c, patch_h, patch_w).
	        x_flat = x.reshape(-1, patch_h, patch_w, c).permute(0, 3, 1, 2)

	        salience_map = self.conv(x_flat).squeeze(1)

	        # Per-position affine transform, then softmax over all positions of a
	        # patch so the weights of each patch sum to one.
	        weight_map = salience_map * self.patch_weight + self.patch_bias
	        weight_map = torch.nn.functional.softmax(
	            weight_map.reshape(-1, patch_h * patch_w), dim=1
	        )
	        weight_map = weight_map.reshape(n, p, patch_h, patch_w, 1, 1)

	        patched_features = torch.sum(weight_map * x, dim=(2, 3))
	        if self.normalize_features:
	            # normalize() divides by max(norm, eps), so an all-zero feature
	            # vector stays zero instead of becoming NaN from a 0 / 0 division.
	            patched_features = torch.nn.functional.normalize(
	                patched_features, dim=-1
	            )

	        return (
	            patched_features,
	            salience_map.reshape(n, p, patch_h, patch_w, 1, 1),
	            weight_map,
	            torch.cat([self.patch_weight, self.patch_bias], dim=1),
	        )