|
|
|
|
|
|
|
from dataclasses import dataclass, field |
|
from typing import List |
|
|
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
|
|
from timm.models import create_model |
|
|
|
import sys, os |
|
from math import sqrt |
|
|
|
|
|
|
|
|
|
|
|
|
|
from .cliploss import ClipLoss |
|
from .quant import VectorQuantizer2 |
|
from .lookup_free_quantize import LFQ |
|
from .dino_enc.dinov2 import DINOv2Encoder, DINOv2Decoder |
|
from .latent_perturbation import add_perturbation |
|
from datasets import Denormalize |
|
from datasets import Normalize as ImgNormalize |
|
|
|
import torch.distributed as tdist |
|
|
|
|
|
@dataclass |
|
class ModelArgs: |
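    """Hyperparameters for VQModel: codebook size/dim, encoder/decoder backbones,
    the multi-scale patch schedule (v_patch_nums), product quantization, and the
    semantic (DINOv2) / detail (CLIP) guidance-loss settings."""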
|
codebook_size: int = 16384 |
|
codebook_embed_dim: int = 8 |
|
codebook_l2_norm: bool = True |
|
codebook_show_usage: bool = True |
|
commit_loss_beta: float = 0.25 |
|
entropy_loss_ratio: float = 0.0 |
|
|
|
encoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4]) |
|
decoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4]) |
|
z_channels: int = 256 |
|
dropout_p: float = 0.0 |
|
|
|
v_patch_nums: List[int] = field( |
|
default_factory=lambda: [1, 2, 3, 4, 5, 6, 8, 10, 13, 16] |
|
) |
|
enc_type: str = "cnn" |
|
dec_type: str = "cnn" |
|
semantic_guide: str = "dinov2" |
|
detail_guide: str = "clip" |
|
num_latent_tokens: int = 256 |
|
encoder_model: str = "vit_small_patch14_dinov2.lvd142m" |
|
decoder_model: str = "vit_small_patch14_dinov2.lvd142m" |
|
abs_pos_embed: bool = False |
|
share_quant_resi: int = 4 |
|
product_quant: int = 1 |
|
codebook_drop: float = 0.0 |
|
half_sem: bool = False |
|
start_drop: int = 1 |
|
sem_loss_weight: float = 0.1 |
|
detail_loss_weight: float = 0.1 |
|
clip_norm: bool = False |
|
sem_loss_scale: float = 1.0 |
|
detail_loss_scale: float = 1.0 |
|
guide_type_1: str = "class" |
|
guide_type_2: str = "class" |
|
|
|
lfq: bool = False |
|
scale: float = 1.0 |
|
soft_entropy: bool = True |
|
|
|
dependency_loss_weight: float = 0.0 |
|
|
|
test_model: bool = False |
|
|
|
|
|
class VQModel(nn.Module): |
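    """Multi-scale residual VQ tokenizer with optional product quantization.

    Combines a CNN or DINOv2-ViT encoder/decoder with VectorQuantizer,
    VectorQuantizer2, or LFQ codebooks, and adds frozen DINOv2 (semantic) and
    CLIP (detail) teachers whose features guide the quantized latents.
    """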
|
def __init__( |
|
self, |
|
config: ModelArgs, |
|
): |
|
super().__init__() |
|
self.config = config |
|
self.enc_type = config.enc_type |
|
self.dec_type = config.dec_type |
|
self.product_quant = config.product_quant |
|
self.half_sem = config.half_sem |
|
self.start_drop = config.start_drop |
|
self.clip_norm = config.clip_norm |
|
config.num_latent_tokens = ( |
|
config.num_latent_tokens * config.product_quant |
|
) |
|
|
|
if config.enc_type == "cnn": |
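            # VQGAN-style convolutional encoder; a 1x1 conv projects its output
            # into the codebook embedding space.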
|
self.encoder = Encoder( |
|
ch_mult=config.encoder_ch_mult, |
|
z_channels=config.z_channels, |
|
dropout=config.dropout_p, |
|
) |
|
self.quant_conv = nn.Conv2d(config.z_channels, config.codebook_embed_dim, 1) |
|
elif config.enc_type == "dinov2": |
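            # ViT encoder initialized from DINOv2 weights (via timm), producing a
            # sequence of latent tokens instead of a spatial feature map.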
|
self.encoder = DINOv2Encoder( |
|
in_channels=3, |
|
num_latent_tokens=config.num_latent_tokens, |
|
model_name=config.encoder_model, |
|
model_kwargs={"img_size": 256, "patch_size": 16, "drop_path_rate": 0.1}, |
|
pretrained=True, |
|
tuning_method="full", |
|
tuning_kwargs={"r": 8}, |
|
abs_pos_embed=config.abs_pos_embed, |
|
product_quant=config.product_quant, |
|
) |
|
self.quant_conv = nn.Conv2d( |
|
self.encoder.embed_dim, config.codebook_embed_dim, 1 |
|
) |
|
else: |
|
raise NotImplementedError |
|
|
|
if config.dec_type == "cnn": |
|
self.decoder = Decoder( |
|
ch_mult=config.decoder_ch_mult, |
|
z_channels=config.z_channels, |
|
dropout=config.dropout_p, |
|
) |
|
self.post_quant_conv = nn.Conv2d( |
|
config.codebook_embed_dim, config.z_channels, 1 |
|
) |
|
elif config.dec_type == "dinov2": |
|
self.decoder = DINOv2Decoder( |
|
in_channels=3, |
|
num_latent_tokens=config.num_latent_tokens // self.product_quant, |
|
model_name=config.decoder_model, |
|
model_kwargs={"img_size": 256, "patch_size": 16, "drop_path_rate": 0.1}, |
|
pretrained=True, |
|
tuning_method="full", |
|
tuning_kwargs={"r": 8}, |
|
to_pixel="linear", |
|
use_rope=False, |
|
cond_latent=False, |
|
abs_pos_embed=config.abs_pos_embed, |
|
) |
|
self.post_quant_conv = nn.Conv2d( |
|
config.codebook_embed_dim, self.decoder.embed_dim, 1 |
|
) |
|
|
|
self.V = self.vocab_size = config.codebook_size * self.product_quant |
|
self.Cvae = config.codebook_embed_dim * self.product_quant |
|
if self.product_quant > 1: |
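            # Product quantization: one independent quantizer (and codebook) per group.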
|
if len(config.v_patch_nums) == 1: |
|
self.quantizes = nn.ModuleList( |
|
[ |
|
VectorQuantizer( |
|
config.codebook_size, |
|
config.codebook_embed_dim, |
|
config.commit_loss_beta, |
|
config.codebook_l2_norm, |
|
) |
|
for _ in range(self.product_quant) |
|
] |
|
) |
|
elif not config.lfq: |
|
self.quantizes = nn.ModuleList( |
|
[ |
|
VectorQuantizer2( |
|
config.codebook_size, |
|
config.codebook_embed_dim, |
|
v_patch_nums=config.v_patch_nums, |
|
num_latent_tokens=config.num_latent_tokens |
|
// self.product_quant, |
|
share_quant_resi=config.share_quant_resi, |
|
codebook_drop=config.codebook_drop, |
|
) |
|
for _ in range(self.product_quant) |
|
] |
|
) |
|
else: |
|
self.quantizes = nn.ModuleList( |
|
[ |
|
LFQ( |
|
config.codebook_size, |
|
config.codebook_embed_dim, |
|
v_patch_nums=config.v_patch_nums, |
|
num_latent_tokens=config.num_latent_tokens |
|
// self.product_quant, |
|
share_quant_resi=config.share_quant_resi, |
|
codebook_drop=config.codebook_drop, |
|
using_znorm=config.codebook_l2_norm, |
|
scale=config.scale, |
|
entropy_weight=config.entropy_loss_ratio, |
|
soft_entropy=config.soft_entropy, |
|
) |
|
for _ in range(self.product_quant) |
|
] |
|
) |
|
self.post_quant_conv = nn.Conv2d( |
|
config.codebook_embed_dim * self.product_quant, |
|
self.decoder.embed_dim, |
|
1, |
|
) |
|
else: |
|
if len(config.v_patch_nums) == 1: |
|
self.quantize = VectorQuantizer( |
|
config.codebook_size, |
|
config.codebook_embed_dim, |
|
config.commit_loss_beta, |
|
config.codebook_l2_norm, |
|
) |
|
elif not config.lfq: |
|
self.quantize = VectorQuantizer2( |
|
config.codebook_size, |
|
config.codebook_embed_dim, |
|
v_patch_nums=config.v_patch_nums, |
|
num_latent_tokens=config.num_latent_tokens, |
|
share_quant_resi=config.share_quant_resi, |
|
) |
|
else: |
|
self.quantize = LFQ( |
|
config.codebook_size, |
|
config.codebook_embed_dim, |
|
v_patch_nums=config.v_patch_nums, |
|
num_latent_tokens=config.num_latent_tokens, |
|
share_quant_resi=config.share_quant_resi, |
|
codebook_drop=config.codebook_drop, |
|
using_znorm=config.codebook_l2_norm, |
|
scale=config.scale, |
|
entropy_weight=config.entropy_loss_ratio, |
|
soft_entropy=config.soft_entropy, |
|
) |
|
|
|
self.codebook_embed_dim = config.codebook_embed_dim |
|
self.v_patch_nums = config.v_patch_nums |
|
self.codebook_drop = config.codebook_drop |
|
|
|
self.semantic_guide = config.semantic_guide |
|
self.denormalize = Denormalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) |
|
self.normalize = ImgNormalize( |
|
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] |
|
) |
|
if self.semantic_guide == "dinov2": |
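            # Frozen DINOv2 teacher; its features are the targets of the
            # ClipLoss-based semantic alignment below.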
|
semantic_model = create_model( |
|
config.encoder_model, |
|
pretrained=True, |
|
img_size=256, |
|
patch_size=16, |
|
drop_path_rate=0.0, |
|
) |
|
semantic_model.eval() |
|
for param in semantic_model.parameters(): |
|
param.requires_grad = False |
|
            self.semantic_model = semantic_model
|
|
|
local_loss = False |
|
gather_with_grad = True |
|
rank = tdist.get_rank() |
|
world_size = tdist.get_world_size() |
|
use_horovod = False |
|
sem_loss_scale = config.sem_loss_scale |
|
|
|
self.sem_loss_scale = sem_loss_scale |
|
self.semantic_loss = ClipLoss( |
|
local_loss=local_loss, |
|
gather_with_grad=gather_with_grad, |
|
cache_labels=True, |
|
rank=rank, |
|
world_size=world_size, |
|
use_horovod=use_horovod, |
|
) |
|
if not self.half_sem and self.product_quant > 1: |
|
self.sem_linear = nn.Conv2d( |
|
self.product_quant * config.codebook_embed_dim, |
|
config.codebook_embed_dim, |
|
1, |
|
) |
|
elif self.half_sem and self.product_quant == 1: |
|
self.sem_linear = nn.Conv2d(768, config.codebook_embed_dim // 2, 1) |
|
if self.enc_type == "cnn": |
|
self.sem_linear = torch.nn.Linear(384, config.codebook_embed_dim) |
|
|
|
self.sem_loss_weight = config.sem_loss_weight |
|
|
|
self.detail_guide = config.detail_guide |
|
if self.detail_guide != "none": |
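            # Frozen CLIP ViT-B/16 teacher providing patch-level "detail" targets,
            # mirroring the semantic guide above.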
|
detail_model = create_model( |
|
"vit_base_patch16_clip_224.openai", |
|
pretrained=True, |
|
img_size=256, |
|
patch_size=16, |
|
drop_path_rate=0.0, |
|
) |
|
detail_model.eval() |
|
for param in detail_model.parameters(): |
|
param.requires_grad = False |
|
self.detail_model = detail_model |
|
|
|
self.detail_loss_scale = config.detail_loss_scale |
|
self.detail_loss = ClipLoss( |
|
local_loss=False, |
|
gather_with_grad=True, |
|
cache_labels=True, |
|
rank=tdist.get_rank(), |
|
world_size=tdist.get_world_size(), |
|
use_horovod=False, |
|
) |
|
self.detail_loss_weight = config.detail_loss_weight |
|
|
|
self.guide_type_1 = config.guide_type_1 |
|
self.guide_type_2 = config.guide_type_2 |
|
self.dependency_loss_weight = config.dependency_loss_weight |
|
|
|
self.test_mode = config.test_model |
|
|
|
if self.test_mode: |
|
self.eval() |
|
            for p in self.parameters():
                p.requires_grad_(False)
|
|
|
def finetune(self, enc_tuning_method, dec_tuning_method): |
|
self.encoder.finetine(enc_tuning_method) |
|
self.decoder.finetine(dec_tuning_method) |
|
|
|
def encode(self, x): |
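        """Encode images and project them with quant_conv into the codebook space;
        ViT features are reshaped to (B, C, L, 1) under product quantization,
        otherwise to (B, C, H, W)."""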
|
h = self.encoder(x) |
|
if self.enc_type == "dinov2": |
|
b, l, c = h.shape |
|
if self.product_quant > 1: |
|
assert int(sqrt(l // self.product_quant)) ** 2 * self.product_quant == l |
|
h = h.view(b, l, 1, c) |
|
h = h.permute(0, 3, 1, 2) |
|
else: |
|
assert int(sqrt(l)) ** 2 == l |
|
h = h.view(b, int(sqrt(l)), int(sqrt(l)), c) |
|
h = h.permute(0, 3, 1, 2) |
|
h = self.quant_conv(h) |
|
return h |
|
|
|
def decode(self, quant, return_quant=False): |
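        """Project quantized features through post_quant_conv and decode to pixels;
        a ViT decoder consumes the features as a (B, L, C) token sequence."""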
|
quant = self.post_quant_conv(quant) |
|
if self.dec_type == "dinov2": |
|
quant = quant.flatten(2).permute(0, 2, 1) |
|
dec = self.decoder(quant) |
|
return dec |
|
|
|
def decode_code( |
|
self, |
|
code_b, |
|
): |
|
        quant_b, usages, mean_vq_loss, *_ = self.quantize(code_b, ret_usages=True)
|
dec = self.decode(quant_b) |
|
return dec |
|
|
|
def forward(self, input, epoch, alpha, beta, delta): |
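        """Training forward pass: encode, (product-)quantize across scales, decode,
        and compute the guidance losses.

        Returns (dec, (vq_loss, commit_loss, entropy, usages), sem_loss,
        detail_loss, dependency_loss).
        """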
|
h = self.encode(input) |
|
b, c, l, _ = h.shape |
|
if len(self.v_patch_nums) == 1: |
|
dropout_rand = None |
|
else: |
|
dropout_rand = torch.randint( |
|
self.start_drop, len(self.v_patch_nums) + 1, (b,) |
|
) |
|
|
|
if self.product_quant > 1: |
|
h_list = h.chunk(chunks=self.product_quant, dim=2) |
|
( |
|
quant_list, |
|
usages_list, |
|
mean_vq_loss_list, |
|
commit_loss_list, |
|
entropy_list, |
|
) = ([], [], [], [], []) |
|
for i, h in enumerate(h_list): |
|
h = h.view( |
|
b, |
|
-1, |
|
int(sqrt(l // self.product_quant)), |
|
int(sqrt(l // self.product_quant)), |
|
) |
|
quant, usages, vq_loss, commit_loss, entropy_loss = self.quantizes[ |
|
i |
|
].forward(h, ret_usages=True, dropout=dropout_rand) |
|
quant_list.append(quant) |
|
usages_list.append(usages) |
|
mean_vq_loss_list.append(vq_loss) |
|
commit_loss_list.append(commit_loss) |
|
entropy_list.append(entropy_loss) |
|
dependency_loss = self.dependency_loss_weight * orthogonal_cosine_loss( |
|
torch.mean(quant_list[0], dim=(2, 3)).contiguous(), |
|
torch.mean(quant_list[-1], dim=(2, 3)).contiguous(), |
|
) |
|
usages = [sum(us) / self.product_quant for us in zip(*usages_list)] |
|
mean_vq_loss = sum(mean_vq_loss_list) / self.product_quant |
|
mean_commit_loss = sum(commit_loss_list) / self.product_quant |
|
mean_entropy = sum(entropy_list) / self.product_quant |
|
quant = torch.cat(quant_list, dim=1) |
|
else: |
|
dependency_loss = 0.0 |
|
quant, usages, mean_vq_loss, mean_commit_loss, mean_entropy = ( |
|
self.quantize.forward(h, ret_usages=True, dropout=dropout_rand) |
|
) |
|
|
quant = add_perturbation( |
|
h, |
|
quant, |
|
self.quantize.z_channels, |
|
self.quantize.codebook_norm, |
|
self.quantize.embedding, |
|
alpha, |
|
beta, |
|
delta, |
|
) |
|
quant_list = [quant] |
|
|
|
dec = self.decode(quant) |
|
|
|
|
|
input = self.normalize(self.denormalize(input)) |
|
if self.semantic_guide != "none": |
|
if self.guide_type_1 == "class": |
|
z_s = self.semantic_model(input) |
|
z_s = z_s[..., None, None] |
|
else: |
|
z_s = self.semantic_model.forward_features(input)[:, 1:, :] |
|
z_s = z_s.reshape(b, 768, 16, 16) |
|
if self.enc_type == "dinov2": |
|
z_s = self.quant_conv(z_s).contiguous() |
|
semantic_quant = quant_list[-1] |
|
z_s = torch.mean(z_s, dim=(2, 3)).contiguous() |
|
z_q_ = torch.mean(semantic_quant, dim=(2, 3)).contiguous() |
|
elif self.enc_type == "cnn": |
|
z_q_ = torch.mean(h, dim=(2, 3)).contiguous() |
|
z_s = self.sem_linear(z_s).contiguous() |
|
|
|
n_drop = int(b * self.codebook_drop) |
|
with torch.cuda.amp.autocast(enabled=False): |
|
sem_loss_scale = self.sem_loss_scale |
|
feat1 = z_s[n_drop:].float() |
|
feat2 = z_q_[n_drop:].float() |
|
if self.clip_norm: |
|
feat1 = feat1 / feat1.norm(dim=1, keepdim=True) |
|
feat2 = feat2 / feat2.norm(dim=1, keepdim=True) |
|
sem_loss_scale = ( |
|
(epoch % 200) / 200 * (100 - sem_loss_scale) + sem_loss_scale |
|
if epoch < 200 |
|
else 100 |
|
) |
|
sem_loss = self.semantic_loss.forward( |
|
feat1, feat2, logit_scale=sem_loss_scale |
|
) |
|
sem_loss = sem_loss * self.sem_loss_weight |
|
else: |
|
sem_loss = None |
|
|
|
if self.detail_guide != "none": |
|
            assert (
                self.guide_type_2 == "patch"
            ), "currently only 'patch' is supported for the detail guide"
|
if self.guide_type_2 == "class": |
|
z_d = self.detail_model(input) |
|
z_d = z_d[..., None, None] |
|
else: |
|
z_d = self.detail_model.forward_features(input)[:, 1:, :] |
|
z_d = z_d.reshape(b, 768, 16, 16) |
|
if self.enc_type == "dinov2": |
|
z_d = self.quant_conv(z_d).contiguous() |
|
detail_quant = quant_list[0] |
|
z_d = torch.mean(z_d, dim=(2, 3)).contiguous() |
|
z_q_ = torch.mean(detail_quant, dim=(2, 3)).contiguous() |
|
elif self.enc_type == "cnn": |
|
pass |
|
|
|
n_drop = int(b * self.codebook_drop) |
|
with torch.cuda.amp.autocast(enabled=False): |
|
detail_loss_scale = self.detail_loss_scale |
|
feat1 = z_d[n_drop:].float() |
|
feat2 = z_q_[n_drop:].float() |
|
if self.clip_norm: |
|
feat1 = feat1 / feat1.norm(dim=1, keepdim=True) |
|
feat2 = feat2 / feat2.norm(dim=1, keepdim=True) |
|
detail_loss_scale = ( |
|
(epoch % 200) / 200 * (100 - detail_loss_scale) |
|
+ detail_loss_scale |
|
if epoch < 200 |
|
else 100 |
|
) |
|
detail_loss = self.detail_loss.forward( |
|
feat1, feat2, logit_scale=detail_loss_scale |
|
) |
|
detail_loss = detail_loss * self.detail_loss_weight |
|
else: |
|
detail_loss = None |
|
|
|
return ( |
|
dec, |
|
(mean_vq_loss, mean_commit_loss, mean_entropy, usages), |
|
sem_loss, |
|
detail_loss, |
|
dependency_loss, |
|
) |
|
|
|
def img_to_reconstructed_img( |
|
self, |
|
x, |
|
last_one=True, |
|
) -> List[torch.Tensor]: |
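        """Encode, quantize at every scale, and decode; returns the final-scale
        reconstruction when last_one=True, else one reconstruction per scale,
        clamped to [-1, 1]."""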
|
h = self.encoder(x) |
|
if self.enc_type == "dinov2": |
|
b, l, c = h.shape |
|
if self.product_quant > 1: |
|
assert int(sqrt(l // self.product_quant)) ** 2 * self.product_quant == l |
|
h = h.view(b, l, 1, c) |
|
h = h.permute(0, 3, 1, 2) |
|
else: |
|
assert int(sqrt(l)) ** 2 == l |
|
h = h.view(b, int(sqrt(l)), int(sqrt(l)), c) |
|
h = h.permute(0, 3, 1, 2) |
|
f = self.quant_conv(h) |
|
|
|
if self.product_quant > 1: |
|
b, c, l, _ = f.shape |
|
f_list = f.chunk(chunks=self.product_quant, dim=2) |
|
f_list = [ |
|
f.view( |
|
b, |
|
-1, |
|
int(sqrt(l // self.product_quant)), |
|
int(sqrt(l // self.product_quant)), |
|
) |
|
for f in f_list |
|
] |
|
if len(self.v_patch_nums) == 1: |
|
f_hats_list = [ |
|
self.quantizes[i].f_to_idxBl_or_fhat( |
|
f, to_fhat=True, v_patch_nums=None |
|
) |
|
for i, f in enumerate(f_list) |
|
] |
|
else: |
|
f_hats_list = [ |
|
self.quantizes[i].f_to_idxBl_or_fhat( |
|
f, to_fhat=True, v_patch_nums=self.v_patch_nums |
|
) |
|
for i, f in enumerate(f_list) |
|
] |
|
f_hats = [ |
|
self.post_quant_conv(torch.cat(f_hats, dim=1)) |
|
for f_hats in zip(*f_hats_list) |
|
] |
|
else: |
|
if len(self.v_patch_nums) == 1: |
|
ls_f_hat_BChw = self.quantize.f_to_idxBl_or_fhat( |
|
f, to_fhat=True, v_patch_nums=None |
|
) |
|
else: |
|
ls_f_hat_BChw = self.quantize.f_to_idxBl_or_fhat( |
|
f, to_fhat=True, v_patch_nums=self.v_patch_nums |
|
) |
|
f_hats = [self.post_quant_conv(f_hat) for f_hat in ls_f_hat_BChw] |
|
|
|
if self.dec_type == "dinov2": |
|
f_hats = [f_hat.flatten(2).permute(0, 2, 1) for f_hat in f_hats] |
|
|
|
if last_one: |
|
return self.decoder(f_hats[-1]).clamp_(-1, 1) |
|
else: |
|
return [self.decoder(f_hat).clamp_(-1, 1) for f_hat in f_hats] |
|
|
|
def img_to_sem_feat( |
|
self, |
|
x, |
|
) -> List[torch.Tensor]: |
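        """Return the finest-scale quantized features of the last (semantic)
        sub-quantizer; assumes product quantization is enabled."""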
|
h = self.encoder(x) |
|
if self.enc_type == "dinov2": |
|
b, l, c = h.shape |
|
if self.product_quant > 1: |
|
assert int(sqrt(l // self.product_quant)) ** 2 * self.product_quant == l |
|
h = h.view(b, l, 1, c) |
|
h = h.permute(0, 3, 1, 2) |
|
else: |
|
assert int(sqrt(l)) ** 2 == l |
|
h = h.view(b, int(sqrt(l)), int(sqrt(l)), c) |
|
h = h.permute(0, 3, 1, 2) |
|
f = self.quant_conv(h) |
|
|
|
b, c, l, _ = f.shape |
|
f_list = f.chunk(chunks=self.product_quant, dim=2) |
|
f_list = [ |
|
f.view( |
|
b, |
|
-1, |
|
int(sqrt(l // self.product_quant)), |
|
int(sqrt(l // self.product_quant)), |
|
) |
|
for f in f_list |
|
] |
|
f_hats_list = [ |
|
self.quantizes[i].f_to_idxBl_or_fhat( |
|
f, to_fhat=True, v_patch_nums=self.v_patch_nums |
|
) |
|
for i, f in enumerate(f_list) |
|
] |
|
|
|
        z_q = f_hats_list[-1][-1]
|
return z_q |
|
|
|
def fhat_to_img(self, f_hat: torch.Tensor): |
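        """Decode an accumulated f_hat back to an image clamped to [-1, 1]."""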
|
f_hat = self.post_quant_conv(f_hat) |
|
if self.dec_type == "dinov2": |
|
f_hat = f_hat.flatten(2).permute(0, 2, 1) |
|
return self.decoder(f_hat).clamp_(-1, 1) |
|
|
|
def idxBl_to_var_input(self, gt_idx_Bl): |
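        """Build the teacher-forcing input of the autoregressive transformer from
        ground-truth index lists; product-quantized branches are concatenated
        along the last dimension."""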
|
if self.product_quant > 1: |
|
x_BLCv_wo_first_l_list = [ |
|
self.quantizes[i].idxBl_to_var_input(gt_idx_Bl[i]) |
|
for i in range(self.product_quant) |
|
] |
|
return torch.cat(x_BLCv_wo_first_l_list, dim=-1) |
|
else: |
|
return self.quantize.idxBl_to_var_input(gt_idx_Bl) |
|
|
|
def get_next_autoregressive_input(self, si, SN, f_hat, h_BChw): |
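        """Advance one autoregressive scale per sub-quantizer and concatenate the
        updated f_hat and next-scale token maps channel-wise."""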
|
f_hat_list = f_hat.chunk(self.product_quant, dim=1) |
|
h_BChw_list = h_BChw.chunk(self.product_quant, dim=1) |
|
out_fhat_list, out_next_token_map_list = [], [] |
|
for i, (f_hat, h_BChw) in enumerate(zip(f_hat_list, h_BChw_list)): |
|
out_fhat, out_next_token_map = self.quantizes[ |
|
i |
|
].get_next_autoregressive_input(si, SN, f_hat, h_BChw) |
|
out_fhat_list.append(out_fhat) |
|
out_next_token_map_list.append(out_next_token_map) |
|
f_hat = torch.cat(out_fhat_list, dim=1) |
|
next_token_map = torch.cat(out_next_token_map_list, dim=1) |
|
return f_hat, next_token_map |
|
|
|
|
|
class Encoder(nn.Module): |
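    """VQGAN-style convolutional encoder: downsampling stages of residual blocks,
    with self-attention at the lowest resolution, ending in a z_channels map."""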
|
def __init__( |
|
self, |
|
in_channels=3, |
|
ch=128, |
|
ch_mult=(1, 1, 2, 2, 4), |
|
num_res_blocks=2, |
|
norm_type="group", |
|
dropout=0.0, |
|
resamp_with_conv=True, |
|
z_channels=256, |
|
): |
|
super().__init__() |
|
self.num_resolutions = len(ch_mult) |
|
self.num_res_blocks = num_res_blocks |
|
self.conv_in = nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1) |
|
|
|
|
|
in_ch_mult = (1,) + tuple(ch_mult) |
|
self.conv_blocks = nn.ModuleList() |
|
for i_level in range(self.num_resolutions): |
|
conv_block = nn.Module() |
|
|
|
res_block = nn.ModuleList() |
|
attn_block = nn.ModuleList() |
|
block_in = ch * in_ch_mult[i_level] |
|
block_out = ch * ch_mult[i_level] |
|
for _ in range(self.num_res_blocks): |
|
res_block.append( |
|
ResnetBlock( |
|
block_in, block_out, dropout=dropout, norm_type=norm_type |
|
) |
|
) |
|
block_in = block_out |
|
if i_level == self.num_resolutions - 1: |
|
attn_block.append(AttnBlock(block_in, norm_type)) |
|
conv_block.res = res_block |
|
conv_block.attn = attn_block |
|
|
|
if i_level != self.num_resolutions - 1: |
|
conv_block.downsample = Downsample(block_in, resamp_with_conv) |
|
self.conv_blocks.append(conv_block) |
|
|
|
|
|
self.mid = nn.ModuleList() |
|
self.mid.append( |
|
ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type) |
|
) |
|
self.mid.append(AttnBlock(block_in, norm_type=norm_type)) |
|
self.mid.append( |
|
ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type) |
|
) |
|
|
|
|
|
self.norm_out = Normalize(block_in, norm_type) |
|
self.conv_out = nn.Conv2d( |
|
block_in, z_channels, kernel_size=3, stride=1, padding=1 |
|
) |
|
|
|
def forward(self, x): |
|
h = self.conv_in(x) |
|
|
|
for i_level, block in enumerate(self.conv_blocks): |
|
for i_block in range(self.num_res_blocks): |
|
h = block.res[i_block](h) |
|
if len(block.attn) > 0: |
|
h = block.attn[i_block](h) |
|
if i_level != self.num_resolutions - 1: |
|
h = block.downsample(h) |
|
|
|
|
|
for mid_block in self.mid: |
|
h = mid_block(h) |
|
|
|
|
|
h = self.norm_out(h) |
|
h = nonlinearity(h) |
|
h = self.conv_out(h) |
|
return h |
|
|
|
|
|
class Decoder(nn.Module): |
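    """Convolutional decoder mirroring Encoder: mid blocks followed by upsampling
    stages of residual (and attention) blocks back to RGB."""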
|
def __init__( |
|
self, |
|
z_channels=256, |
|
ch=128, |
|
ch_mult=(1, 1, 2, 2, 4), |
|
num_res_blocks=2, |
|
norm_type="group", |
|
dropout=0.0, |
|
resamp_with_conv=True, |
|
out_channels=3, |
|
): |
|
super().__init__() |
|
self.num_resolutions = len(ch_mult) |
|
self.num_res_blocks = num_res_blocks |
|
|
|
block_in = ch * ch_mult[self.num_resolutions - 1] |
|
|
|
self.conv_in = nn.Conv2d( |
|
z_channels, block_in, kernel_size=3, stride=1, padding=1 |
|
) |
|
|
|
|
|
self.mid = nn.ModuleList() |
|
self.mid.append( |
|
ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type) |
|
) |
|
self.mid.append(AttnBlock(block_in, norm_type=norm_type)) |
|
self.mid.append( |
|
ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type) |
|
) |
|
|
|
|
|
self.conv_blocks = nn.ModuleList() |
|
for i_level in reversed(range(self.num_resolutions)): |
|
conv_block = nn.Module() |
|
|
|
res_block = nn.ModuleList() |
|
attn_block = nn.ModuleList() |
|
block_out = ch * ch_mult[i_level] |
|
for _ in range(self.num_res_blocks + 1): |
|
res_block.append( |
|
ResnetBlock( |
|
block_in, block_out, dropout=dropout, norm_type=norm_type |
|
) |
|
) |
|
block_in = block_out |
|
if i_level == self.num_resolutions - 1: |
|
attn_block.append(AttnBlock(block_in, norm_type)) |
|
conv_block.res = res_block |
|
conv_block.attn = attn_block |
|
|
|
if i_level != 0: |
|
conv_block.upsample = Upsample(block_in, resamp_with_conv) |
|
self.conv_blocks.append(conv_block) |
|
|
|
|
|
self.norm_out = Normalize(block_in, norm_type) |
|
self.conv_out = nn.Conv2d( |
|
block_in, out_channels, kernel_size=3, stride=1, padding=1 |
|
) |
|
|
|
@property |
|
def last_layer(self): |
|
return self.conv_out.weight |
|
|
|
def forward(self, z): |
|
|
|
h = self.conv_in(z) |
|
|
|
|
|
for mid_block in self.mid: |
|
h = mid_block(h) |
|
|
|
|
|
for i_level, block in enumerate(self.conv_blocks): |
|
for i_block in range(self.num_res_blocks + 1): |
|
h = block.res[i_block](h) |
|
if len(block.attn) > 0: |
|
h = block.attn[i_block](h) |
|
if i_level != self.num_resolutions - 1: |
|
h = block.upsample(h) |
|
|
|
|
|
h = self.norm_out(h) |
|
h = nonlinearity(h) |
|
h = self.conv_out(h) |
|
return h |
|
|
|
|
|
class ResnetBlock(nn.Module): |
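    """Pre-activation residual block (Normalize + swish + conv), with a 3x3 or 1x1
    shortcut when the channel count changes."""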
|
def __init__( |
|
self, |
|
in_channels, |
|
out_channels=None, |
|
conv_shortcut=False, |
|
dropout=0.0, |
|
norm_type="group", |
|
): |
|
super().__init__() |
|
self.in_channels = in_channels |
|
out_channels = in_channels if out_channels is None else out_channels |
|
self.out_channels = out_channels |
|
self.use_conv_shortcut = conv_shortcut |
|
|
|
self.norm1 = Normalize(in_channels, norm_type) |
|
self.conv1 = nn.Conv2d( |
|
in_channels, out_channels, kernel_size=3, stride=1, padding=1 |
|
) |
|
self.norm2 = Normalize(out_channels, norm_type) |
|
self.dropout = nn.Dropout(dropout) |
|
self.conv2 = nn.Conv2d( |
|
out_channels, out_channels, kernel_size=3, stride=1, padding=1 |
|
) |
|
|
|
if self.in_channels != self.out_channels: |
|
if self.use_conv_shortcut: |
|
self.conv_shortcut = nn.Conv2d( |
|
in_channels, out_channels, kernel_size=3, stride=1, padding=1 |
|
) |
|
else: |
|
self.nin_shortcut = nn.Conv2d( |
|
in_channels, out_channels, kernel_size=1, stride=1, padding=0 |
|
) |
|
|
|
def forward(self, x): |
|
h = x |
|
h = self.norm1(h) |
|
h = nonlinearity(h) |
|
h = self.conv1(h) |
|
h = self.norm2(h) |
|
h = nonlinearity(h) |
|
h = self.dropout(h) |
|
h = self.conv2(h) |
|
|
|
if self.in_channels != self.out_channels: |
|
if self.use_conv_shortcut: |
|
x = self.conv_shortcut(x) |
|
else: |
|
x = self.nin_shortcut(x) |
|
return x + h |
|
|
|
|
|
class AttnBlock(nn.Module): |
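    """Single-head self-attention over spatial positions with a residual connection."""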
|
def __init__(self, in_channels, norm_type="group"): |
|
super().__init__() |
|
self.norm = Normalize(in_channels, norm_type) |
|
self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) |
|
self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) |
|
self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0) |
|
self.proj_out = nn.Conv2d( |
|
in_channels, in_channels, kernel_size=1, stride=1, padding=0 |
|
) |
|
|
|
def forward(self, x): |
|
h_ = x |
|
h_ = self.norm(h_) |
|
q = self.q(h_) |
|
k = self.k(h_) |
|
v = self.v(h_) |
|
|
|
|
|
b, c, h, w = q.shape |
|
q = q.reshape(b, c, h * w) |
|
q = q.permute(0, 2, 1) |
|
k = k.reshape(b, c, h * w) |
|
w_ = torch.bmm(q, k) |
|
w_ = w_ * (int(c) ** (-0.5)) |
|
w_ = F.softmax(w_, dim=2) |
|
|
|
|
|
v = v.reshape(b, c, h * w) |
|
w_ = w_.permute(0, 2, 1) |
|
h_ = torch.bmm(v, w_) |
|
h_ = h_.reshape(b, c, h, w) |
|
|
|
h_ = self.proj_out(h_) |
|
|
|
return x + h_ |
|
|
|
|
|
def nonlinearity(x): |
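    # swish / SiLU activation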
|
|
|
return x * torch.sigmoid(x) |
|
|
|
|
|
def Normalize(in_channels, norm_type="group"): |
|
assert norm_type in ["group", "batch"] |
|
if norm_type == "group": |
|
return nn.GroupNorm( |
|
num_groups=32, num_channels=in_channels, eps=1e-6, affine=True |
|
) |
|
elif norm_type == "batch": |
|
return nn.SyncBatchNorm(in_channels) |
|
|
|
|
|
class Upsample(nn.Module): |
|
def __init__(self, in_channels, with_conv): |
|
super().__init__() |
|
self.with_conv = with_conv |
|
if self.with_conv: |
|
self.conv = nn.Conv2d( |
|
in_channels, in_channels, kernel_size=3, stride=1, padding=1 |
|
) |
|
|
|
def forward(self, x): |
|
x = F.interpolate(x, scale_factor=2.0, mode="nearest") |
|
if self.with_conv: |
|
x = self.conv(x) |
|
return x |
|
|
|
|
|
class Downsample(nn.Module): |
|
def __init__(self, in_channels, with_conv): |
|
super().__init__() |
|
self.with_conv = with_conv |
|
if self.with_conv: |
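            # stride-2 conv with no built-in asymmetric padding; forward() pads
            # the right/bottom edge by one pixel instead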
|
|
|
self.conv = nn.Conv2d( |
|
in_channels, in_channels, kernel_size=3, stride=2, padding=0 |
|
) |
|
|
|
def forward(self, x): |
|
if self.with_conv: |
|
pad = (0, 1, 0, 1) |
|
x = F.pad(x, pad, mode="constant", value=0) |
|
x = self.conv(x) |
|
else: |
|
x = F.avg_pool2d(x, kernel_size=2, stride=2) |
|
return x |
|
|
|
|
|
def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01): |
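    """Entropy objective over code affinities: minimizing it encourages confident
    per-sample assignments while keeping the batch-averaged code distribution diverse."""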
|
flat_affinity = affinity.reshape(-1, affinity.shape[-1]) |
|
flat_affinity /= temperature |
|
probs = F.softmax(flat_affinity, dim=-1) |
|
log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1) |
|
if loss_type == "softmax": |
|
target_probs = probs |
|
else: |
|
raise ValueError("Entropy loss {} not supported".format(loss_type)) |
|
avg_probs = torch.mean(target_probs, dim=0) |
|
avg_entropy = -torch.sum(avg_probs * torch.log(avg_probs + 1e-5)) |
|
sample_entropy = -torch.mean(torch.sum(target_probs * log_probs, dim=-1)) |
|
loss = sample_entropy - avg_entropy |
|
return loss |
|
|
|
|
|
class VectorQuantizer(nn.Module): |
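    """Plain single-scale vector quantizer: nearest-codeword lookup with optional
    L2-normalized codes, straight-through gradients, and EMA codebook-usage tracking."""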
|
|
|
def __init__(self, vocab_size=8192, z_channels=32, beta=0.25, codebook_norm=True): |
|
super().__init__() |
|
|
|
self.vocab_size = vocab_size |
|
self.z_channels = z_channels |
|
self.beta = beta |
|
self.codebook_norm = codebook_norm |
|
|
|
|
|
|
|
self.embedding = nn.Embedding(self.vocab_size, self.z_channels) |
|
self.embedding.weight.data.uniform_( |
|
-1.0 / self.vocab_size, 1.0 / self.vocab_size |
|
) |
|
if self.codebook_norm: |
|
self.embedding.weight.data = F.normalize( |
|
self.embedding.weight.data, p=2, dim=-1 |
|
) |
|
|
|
self.register_buffer( |
|
"ema_vocab_hit_SV", torch.full((self.vocab_size,), fill_value=0.0) |
|
) |
|
self.record_hit = 0 |
|
|
|
def no_weight_decay(self): |
|
return [ |
|
"embedding.weight", |
|
] |
|
|
|
def forward(self, z, ret_usages=True, dropout=None): |
|
|
|
vocab_hit_V = torch.zeros(self.vocab_size, dtype=torch.float, device=z.device) |
|
|
|
|
|
z = torch.einsum("b c h w -> b h w c", z).contiguous() |
|
z_flattened = z.view(-1, self.z_channels) |
|
|
|
if self.codebook_norm: |
|
z = F.normalize(z, p=2, dim=-1) |
|
z_flattened = F.normalize(z_flattened, p=2, dim=-1) |
|
embedding = F.normalize(self.embedding.weight, p=2, dim=-1) |
|
else: |
|
embedding = self.embedding.weight |
|
|
|
|
|
d = ( |
|
torch.sum(z_flattened**2, dim=1, keepdim=True) |
|
+ torch.sum(embedding**2, dim=1) |
|
- 2 |
|
* torch.einsum( |
|
"bd,dn->bn", z_flattened, torch.einsum("n d -> d n", embedding) |
|
) |
|
) |
|
|
|
|
|
min_encoding_indices = torch.argmin(d, dim=1) |
|
|
|
z_q = self.embedding(min_encoding_indices).view(z.shape) |
|
if self.codebook_norm: |
|
z_q = F.normalize(z_q, p=2, dim=-1) |
|
|
|
        codebook_usage = 0.0  # usage tracking is skipped outside training
        if ret_usages and self.training:
|
hit_V = min_encoding_indices.bincount(minlength=self.vocab_size).float() |
|
handler = tdist.all_reduce(hit_V, async_op=True) |
|
handler.wait() |
|
if self.record_hit == 0: |
|
self.ema_vocab_hit_SV.copy_(hit_V) |
|
elif self.record_hit < 100: |
|
self.ema_vocab_hit_SV.mul_(0.9).add_(hit_V.mul(0.1)) |
|
else: |
|
self.ema_vocab_hit_SV.mul_(0.99).add_(hit_V.mul(0.01)) |
|
self.record_hit += 1 |
|
vocab_hit_V.add_(hit_V) |
|
|
|
margin = ( |
|
tdist.get_world_size() |
|
* (z.numel() / self.z_channels) |
|
/ self.vocab_size |
|
* 0.08 |
|
) |
|
|
|
codebook_usage = ( |
|
self.ema_vocab_hit_SV >= margin |
|
).float().mean().item() * 100 |
|
|
|
|
|
commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2) |
|
vq_loss = torch.mean((z_q - z.detach()) ** 2) |
|
|
|
|
|
z_q = z + (z_q - z).detach() |
|
|
|
|
|
z_q = torch.einsum("b h w c -> b c h w", z_q) |
|
|
|
return z_q, [codebook_usage], vq_loss, commit_loss, 0.0 |
|
|
|
def f_to_idxBl_or_fhat( |
|
self, z: torch.Tensor, to_fhat: bool, v_patch_nums |
|
): |
|
|
|
z = torch.einsum("b c h w -> b h w c", z).contiguous() |
|
z_flattened = z.view(-1, self.z_channels) |
|
|
|
|
|
if self.codebook_norm: |
|
z = F.normalize(z, p=2, dim=-1) |
|
z_flattened = F.normalize(z_flattened, p=2, dim=-1) |
|
embedding = F.normalize(self.embedding.weight, p=2, dim=-1) |
|
else: |
|
embedding = self.embedding.weight |
|
|
|
|
|
d = ( |
|
torch.sum(z_flattened**2, dim=1, keepdim=True) |
|
+ torch.sum(embedding**2, dim=1) |
|
- 2 |
|
* torch.einsum( |
|
"bd,dn->bn", z_flattened, torch.einsum("n d -> d n", embedding) |
|
) |
|
) |
|
|
|
|
|
min_encoding_indices = torch.argmin(d, dim=1) |
|
|
|
z_q = self.embedding(min_encoding_indices).view(z.shape) |
|
if self.codebook_norm: |
|
z_q = F.normalize(z_q, p=2, dim=-1) |
|
|
|
|
|
z_q = torch.einsum("b h w c -> b c h w", z_q) |
|
|
|
f_hat_or_idx_Bl: List[torch.Tensor] = [z_q if to_fhat else min_encoding_indices] |
|
|
|
return f_hat_or_idx_Bl |
|
|
|
|
|
def orthogonal_cosine_loss(A, B): |
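    """Mean cosine similarity between rows of A and B; used as a dependency loss
    that decorrelates the first and last product-quantization branches."""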
|
A_norm = A / A.norm(dim=1, keepdim=True) |
|
B_norm = B / B.norm(dim=1, keepdim=True) |
|
loss = (A_norm * B_norm).sum(dim=1).mean() |
|
return loss |
|
|
|
|
|
|
|
|
|
|
|
def VQ_8(**kwargs): |
|
return VQModel( |
|
ModelArgs(encoder_ch_mult=[1, 2, 2, 4], decoder_ch_mult=[1, 2, 2, 4], **kwargs) |
|
) |
|
|
|
|
|
def VQ_16(**kwargs): |
|
return VQModel( |
|
ModelArgs( |
|
encoder_ch_mult=[1, 1, 2, 2, 4], decoder_ch_mult=[1, 1, 2, 2, 4], **kwargs |
|
) |
|
) |
|
|
|
|
|
VQ_models = {"VQ-16": VQ_16, "VQ-8": VQ_8} |
|
|
|
if __name__ == "__main__": |
|
semantic_model = create_model( |
|
"vit_small_patch14_dinov2.lvd142m", |
|
pretrained=True, |
|
img_size=256, |
|
patch_size=16, |
|
drop_path_rate=0.0, |
|
) |
|
semantic_model.eval() |
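
    # Minimal local sanity check (a sketch, not tied to any checkpoint): quantize a
    # random feature map with the plain VectorQuantizer. Codebook-usage statistics
    # are only tracked in training mode, so no distributed setup is needed here.
    vq = VectorQuantizer(vocab_size=512, z_channels=32).eval()
    with torch.no_grad():
        z_q, usages, vq_loss, commit_loss, _ = vq(torch.randn(2, 32, 16, 16))
    print("z_q:", tuple(z_q.shape), "vq:", vq_loss.item(), "commit:", commit_loss.item())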
|
|