import math
import logging
from itertools import chain

import torch
import torch.fft
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from timm.models.layers import DropPath, trunc_normal_

from .transformer_ls import AttentionLS

_logger = logging.getLogger(__name__)


class Mlp(nn.Module):
    """Feed-forward block: Linear -> activation -> dropout -> Linear -> dropout."""
def __init__(
self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.0,
):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)

    def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x


class SpectralGatingNetwork(nn.Module):
    """Learned complex-valued filter applied elementwise in the 2D Fourier domain."""
def __init__(self, dim, h=14, w=8):
        super().__init__()
        # Real and imaginary parts of the filter, stored as a trailing dim of 2
        # so the weight can be reinterpreted via torch.view_as_complex.
        self.complex_weight = nn.Parameter(torch.randn(h, w, dim, 2) * 0.02)
        self.w = w
        self.h = h

    def forward(self, x, spatial_size=None):
        B, N, C = x.shape  # e.g. torch.Size([1, 262144, 1024])
        if spatial_size is None:
            a = b = int(math.sqrt(N))  # square grid assumed, e.g. a = b = 512
        else:
            a, b = spatial_size
        x = x.view(B, a, b, C)  # e.g. torch.Size([1, 512, 512, 1024])
        # This block previously ran under autocast to float32; the cast is now explicit.
        dtype = x.dtype
        x = x.to(torch.float32)
        x = torch.fft.rfft2(
            x, dim=(1, 2), norm="ortho"
        )  # e.g. torch.Size([1, 512, 257, 1024])
        weight = torch.view_as_complex(
            self.complex_weight.to(torch.float32)
        )  # e.g. torch.Size([512, 257, 1024])
        x = x * weight
        x = torch.fft.irfft2(
            x, s=(a, b), dim=(1, 2), norm="ortho"
        )  # e.g. torch.Size([1, 512, 512, 1024])
        x = x.to(dtype)  # restore the original dtype
        x = x.reshape(B, N, C)  # e.g. torch.Size([1, 262144, 1024])
        return x
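

# A minimal usage sketch (not part of the original module): demonstrates the
# shape round-trip through SpectralGatingNetwork. All sizes here are
# illustrative assumptions, not values used by the original code.
def _demo_spectral_gating():
    layer = SpectralGatingNetwork(dim=16, h=8, w=5)  # w = h // 2 + 1 for rfft2
    tokens = torch.randn(2, 64, 16)  # (B, N, C) with N = 8 * 8 a perfect square
    out = layer(tokens)
    assert out.shape == tokens.shape  # spectral filtering preserves (B, N, C)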


class BlockSpectralGating(nn.Module):
    """Transformer block with spectral gating in place of attention; a single
    residual connection spans the norm -> filter -> norm -> MLP pipeline."""
def __init__(
self,
dim,
mlp_ratio=4.0,
drop=0.0,
drop_path=0.0,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
h=14,
w=8,
):
super().__init__()
self.norm1 = norm_layer(dim)
self.filter = SpectralGatingNetwork(dim, h=h, w=w)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop,
)

    def forward(self, x, *args):
        # *args absorbs the conditioning tensor that attention blocks receive;
        # it is unused by spectral gating blocks.
        x = x + self.drop_path(self.mlp(self.norm2(self.filter(self.norm1(x)))))
        return x


class BlockAttention(nn.Module):
def __init__(
self,
dim,
num_heads: int = 8,
mlp_ratio=4.0,
drop=0.0,
drop_path=0.0,
w=2,
dp_rank=2,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
rpe=False,
adaLN=False,
nglo=0,
):
"""
num_heads: Attention heads. 4 for tiny, 8 for small and 12 for base
"""
super().__init__()
self.norm1 = norm_layer(dim)
self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(
in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop,
)
self.attn = AttentionLS(
dim=dim,
num_heads=num_heads,
w=w,
dp_rank=dp_rank,
nglo=nglo,
rpe=rpe,
)
if adaLN:
self.adaLN_modulation = nn.Sequential(
nn.Linear(dim, dim, bias=True),
act_layer(),
nn.Linear(dim, 6 * dim, bias=True),
)
else:
self.adaLN_modulation = None

    def forward(self, x: torch.Tensor, c: torch.Tensor) -> torch.Tensor:
        if self.adaLN_modulation is not None:
            # Project the conditioning tensor into shift/scale/gate triples
            # for the attention (mha) and MLP branches.
(
shift_mha,
scale_mha,
gate_mha,
shift_mlp,
scale_mlp,
gate_mlp,
) = self.adaLN_modulation(c).chunk(6, dim=2)
        else:
            # Unconditioned path: all six modulation factors default to 1.0.
            shift_mha, scale_mha, gate_mha, shift_mlp, scale_mlp, gate_mlp = 6 * (1.0,)
x = x + gate_mha * self.drop_path(
self.attn(
self.norm1(x) * scale_mha + shift_mha,
)
)
x = x + gate_mlp * self.drop_path(
self.mlp(self.norm2(x) * scale_mlp + shift_mlp)
)
return x
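

# A minimal sketch (not part of the original module): illustrates how the adaLN
# projection output is split into the six modulation tensors used above. The
# dimensions are illustrative assumptions.
def _demo_adaln_chunks():
    dim = 8
    c = torch.randn(2, 16, dim)  # conditioning tensor of shape (B, N, C)
    modulation = nn.Sequential(
        nn.Linear(dim, dim, bias=True),
        nn.GELU(),
        nn.Linear(dim, 6 * dim, bias=True),
    )
    shift_mha, scale_mha, gate_mha, shift_mlp, scale_mlp, gate_mlp = (
        modulation(c).chunk(6, dim=2)
    )
    assert shift_mha.shape == c.shape  # each chunk matches the input shape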


class SpectFormer(nn.Module):
def __init__(
self,
grid_size: int = 224 // 16,
embed_dim=768,
depth=12,
n_spectral_blocks=4,
num_heads: int = 8,
mlp_ratio=4.0,
uniform_drop=False,
drop_rate=0.0,
drop_path_rate=0.0,
window_size=2,
dp_rank=2,
norm_layer=nn.LayerNorm,
checkpoint_layers: list[int] | None = None,
rpe=False,
ensemble: int | None = None,
nglo: int = 0,
):
"""
Args:
img_size (int, tuple): input image size
patch_size (int, tuple): patch size
embed_dim (int): embedding dimension
depth (int): depth of transformer
n_spectral_blocks (int): number of spectral gating blocks
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
uniform_drop (bool): true for uniform, false for linearly increasing drop path probability.
drop_rate (float): dropout rate
drop_path_rate (float): drop path (stochastic depth) rate
window_size: window size for long/short attention
dp_rank: dp rank for long/short attention
norm_layer: (nn.Module): normalization layer for attention blocks
checkpoint_layers: indicate which layers to use for checkpointing
rpe: Use relative position encoding in Long-Short attention blocks.
ensemble: Integer indicating ensemble size or None for deterministic model.
nglo: Number of (additional) global tokens.
"""
super().__init__()
self.embed_dim = embed_dim
self.n_spectral_blocks = n_spectral_blocks
self._checkpoint_layers = checkpoint_layers or []
self.ensemble = ensemble
self.nglo = nglo
        h = grid_size
        w = h // 2 + 1  # width of the rfft2 output along the second spatial dim
if uniform_drop:
_logger.info(f"Using uniform droppath with expect rate {drop_path_rate}.")
dpr = [drop_path_rate for _ in range(depth)]
else:
            _logger.info(
                f"Using linear drop path with expected rate {drop_path_rate * 0.5}."
            )
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
self.blocks_spectral_gating = nn.ModuleList()
self.blocks_attention = nn.ModuleList()
for i in range(depth):
if i < n_spectral_blocks:
layer = BlockSpectralGating(
dim=embed_dim,
mlp_ratio=mlp_ratio,
drop=drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
h=h,
w=w,
)
self.blocks_spectral_gating.append(layer)
else:
layer = BlockAttention(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
drop=drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
w=window_size,
dp_rank=dp_rank,
rpe=rpe,
                    adaLN=ensemble is not None,
nglo=nglo,
)
self.blocks_attention.append(layer)
self.apply(self._init_weights)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        """
        Args:
            tokens: Tensor of shape (B, N, C) for a deterministic model or
                (B * E, N, C) for an ensemble forecast.
        Returns:
            Tensor of the same shape as the input.
        """
        if self.ensemble:
            # Sample per-token noise to serve as the adaLN conditioning tensor.
            BE, N, C = tokens.shape
            noise = torch.randn(
                size=(BE, N, C), dtype=tokens.dtype, device=tokens.device
            )
        else:
            noise = None
for i, blk in enumerate(
chain(self.blocks_spectral_gating, self.blocks_attention)
):
if i in self._checkpoint_layers:
tokens = checkpoint(blk, tokens, noise, use_reentrant=False)
else:
tokens = blk(tokens, noise)
return tokens

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
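

# A minimal smoke test (not part of the original module): builds a small
# SpectFormer and runs one forward pass. All sizes are illustrative
# assumptions, and the attention blocks rely on the companion transformer_ls
# module; call this after importing the module as part of its package, since
# the relative import of AttentionLS prevents running this file directly.
def _smoke_test():
    model = SpectFormer(
        grid_size=8,  # token grid of 8 x 8 = 64 tokens
        embed_dim=64,
        depth=4,
        n_spectral_blocks=2,
        num_heads=4,
    )
    tokens = torch.randn(2, 64, 64)  # (B, N, C) with N = grid_size ** 2
    out = model(tokens)
    assert out.shape == tokens.shape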