import copy
from typing import Optional
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from scenedino.common import util
class ImplicitNet(nn.Module):
"""
    Represents an MLP;
Original code from IGR
"""
def __init__(
self,
d_in,
dims,
skip_in=(),
d_out=4,
geometric_init=True,
radius_init=0.3,
beta=0.0,
output_init_gain=2.0,
num_position_inputs=3,
sdf_scale=1.0,
dim_excludes_skip=False,
combine_layer=1000,
combine_type="average",
):
"""
:param d_in input size
:param dims dimensions of hidden layers. Num hidden layers == len(dims)
:param skip_in layers with skip connections from input (residual)
:param d_out output size
:param geometric_init if true, uses geometric initialization
(to SDF of sphere)
:param radius_init if geometric_init, then SDF sphere will have
this radius
:param beta softplus beta, 100 is reasonable; if <=0 uses ReLU activations instead
        :param output_init_gain std of the normal init for the extra output channels
        (beyond the SDF channel); only used when d_out > 1
:param dim_excludes_skip if true, dimension sizes do not include skip
connections
"""
super().__init__()
dims = [d_in] + dims + [d_out]
if dim_excludes_skip:
for i in range(1, len(dims) - 1):
if i in skip_in:
dims[i] += d_in
self.num_layers = len(dims)
self.skip_in = skip_in
self.dims = dims
self.combine_layer = combine_layer
self.combine_type = combine_type
for layer in range(0, self.num_layers - 1):
if layer + 1 in skip_in:
out_dim = dims[layer + 1] - d_in
else:
out_dim = dims[layer + 1]
lin = nn.Linear(dims[layer], out_dim)
            # if true, perform geometric initialization
if geometric_init:
if layer == self.num_layers - 2:
# Note our geometric init is negated (compared to IDR)
# since we are using the opposite SDF convention:
# inside is +
nn.init.normal_(
lin.weight[0],
mean=-np.sqrt(np.pi) / np.sqrt(dims[layer]) * sdf_scale,
std=0.00001,
)
nn.init.constant_(lin.bias[0], radius_init)
if d_out > 1:
# More than SDF output
nn.init.normal_(lin.weight[1:], mean=0.0, std=output_init_gain)
nn.init.constant_(lin.bias[1:], 0.0)
else:
nn.init.constant_(lin.bias, 0.0)
nn.init.normal_(lin.weight, 0.0, np.sqrt(2) / np.sqrt(out_dim))
if d_in > num_position_inputs and (layer == 0 or layer in skip_in):
# Special handling for input to allow positional encoding
nn.init.constant_(lin.weight[:, -d_in + num_position_inputs :], 0.0)
else:
nn.init.constant_(lin.bias, 0.0)
nn.init.kaiming_normal_(lin.weight, a=0, mode="fan_in")
setattr(self, "lin" + str(layer), lin)
if beta > 0:
self.activation = nn.Softplus(beta=beta)
else:
# Vanilla ReLU
self.activation = nn.ReLU()
def forward(self, x, combine_inner_dims=(1,)):
"""
:param x (..., d_in)
:param combine_inner_dims Combining dimensions for use with multiview inputs.
Tensor will be reshaped to (-1, combine_inner_dims, ...) and reduced using combine_type
on dim 1, at combine_layer
"""
x_init = x
for layer in range(0, self.num_layers - 1):
lin = getattr(self, "lin" + str(layer))
if layer == self.combine_layer:
x = util.combine_interleaved(x, combine_inner_dims, self.combine_type)
x_init = util.combine_interleaved(
x_init, combine_inner_dims, self.combine_type
)
if layer < self.combine_layer and layer in self.skip_in:
x = torch.cat([x, x_init], -1) / np.sqrt(2)
x = lin(x)
if layer < self.num_layers - 2:
x = self.activation(x)
return x
@classmethod
def from_conf(cls, conf, d_in, d_out):
return cls(d_in=d_in, d_out=d_out, **conf)
# @classmethod
# def from_conf(cls, conf, d_in, **kwargs):
# # PyHocon construction
# return cls(
# d_in,
# conf.get_list("dims"),
# skip_in=conf.get_list("skip_in"),
# beta=conf.get_float("beta", 0.0),
# dim_excludes_skip=conf.get_bool("dim_excludes_skip", False),
# combine_layer=conf.get_int("combine_layer", 1000),
# combine_type=conf.get_string("combine_type", "average"), # average | max
# **kwargs,
# )
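# Hedged usage sketch (not part of the original training code): `_example_implicit_net`
# and all sizes below are illustrative assumptions. It only exercises the skip-connection
# shape bookkeeping; with the default combine_layer=1000 the multi-view reduction in
# forward() is never reached.
def _example_implicit_net():
    net = ImplicitNet(d_in=3, dims=[64, 64, 64, 64], skip_in=(2,), d_out=4)
    x = torch.randn(2, 1024, 3)  # (..., d_in)
    out = net(x)  # (..., d_out) == (2, 1024, 4); channel 0 is the SDF-like output
    return out.shape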
"""
GeoNeRF
https://github.com/idiap/GeoNeRF/blob/e6249fdae5672853c6bbbd4ba380c4c166d02c95/model/self_attn_renderer.py#L60
"""
# Custom TransposeLayer to perform transpose operation
class TransposeLayer(nn.Module):
def __init__(self):
super(TransposeLayer, self).__init__()
def forward(self, x):
        # print("x_shape before transpose: ", x.shape)  # debug
return x.transpose(1, 2)
#
# class CNN2AE(nn.Module):
# def __init__(self, num_channels, num_features, desired_spatial_output): ## reduced mapping: num_points |-> num_features
# super(CNN2AE, self).__init__()
# self.conv1 = nn.Conv1d(num_channels, num_channels*2, kernel_size=3, stride=1, padding=1)
# self.conv2 = nn.Conv1d(num_channels*2, num_channels*4, kernel_size=3, stride=1, padding=1)
# self.conv3 = nn.Conv1d(num_channels*4, num_channels*8, kernel_size=3, stride=1, padding=1)
# self.pool = nn.AvgPool1d(kernel_size=2, stride=2)
# self.desired_spatial_output = desired_spatial_output
# # self.fc = nn.Linear(num_channels*4 * num_features, num_features) # Fully connected layer to further reduce dimension
# # self.fc = nn.Linear(num_channels*4 * (num_features // 4), num_channels) # Fully connected layer to reduce dimension
#
# def forward(self, x): ## input_tensor's shape: (batch_size=1, C=num_channels, M=num_points)
# _, num_channels, num_features = x.shape
# x = self.pool(nn.functional.relu(self.conv1(x)))
# x = self.pool(nn.functional.relu(self.conv2(x)))
# x = self.pool(nn.functional.relu(self.conv3(x)))
# x = x.view(x.size(0), num_channels, self.desired_spatial_output) # Reshape to (batch_size, num_channels, reduced_features)
# return x
device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu"
) # Use GPU if available, else CPU
class CNN2AE(
nn.Module
):  ## convolve density-sampled features along each ray, from one end of the camera frustum to the other ( n_coarse==16 x att_feat==32 x (8x8) )
def __init__(self, num_channels: int = 32, num_features: int = 64):
super(CNN2AE, self).__init__()
self.n_coarse = num_features
self.conv1 = nn.Conv1d(
num_channels, num_channels, kernel_size=3, stride=1, padding=1
)
# self.conv2 = nn.Conv1d(num_channels*2, num_channels*4, kernel_size=3, stride=1, padding=1)
self.pool = nn.AvgPool1d(kernel_size=2, stride=2)
# self.fc = nn.Linear(num_channels * num_features, num_features) # Fully connected layer to further reduce dimension
# self.fc = None # We will initialize this later
def forward(self, x): ## , desired_spatial_output):
        assert (
            x.size(0) % self.n_coarse
        ) == 0, f"__given points should be divisible by n_coarse: {self.n_coarse}, but points given: {x.size(0)}"
# x = x.to(device) # Move the input data to the device
# B_, C_, M_ = x.shape # Get the new number of channels and points
x = self.pool(F.relu(self.conv1(x))) # Apply first conv layer and pool
        x = self.pool(F.relu(self.conv1(x)))  # Apply the shared conv layer a second time and pool again
# if self.fc is None:
# # Initialize the fully connected layer now that we know the input size
# self.fc = nn.Linear(C_ * M_, C_ * desired_spatial_output).to(device)
# x = x.view(B_, C_ * M_) # Reshape to (batch_size, C * M)
# x = self.fc(x) # Apply fully connected layer
# x = x.view(B_, C_, desired_spatial_output) # Reshape to (batch_size, num_channels, desired_spatial_output)
return x
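# Hedged shape check (illustrative sketch, not from the original pipeline): the batch
# dimension must be divisible by n_coarse, and the single shared Conv1d + AvgPool1d pair
# is applied twice, quartering the sample dimension.
def _example_cnn2ae():
    ae = CNN2AE(num_channels=32, num_features=64)
    x = torch.randn(128, 32, 64)  # (B, C, M) with B % n_coarse == 0
    y = ae(x)  # (128, 32, 16): M is halved by each of the two pooling steps
    return y.shape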
## Auto-encoder network
class ConvAutoEncoder(nn.Module):  ## purpose: to enforce geometric generalization
def __init__(
self, num_ch: int = 32, S_: int = 64
    ):  ## S_ := sequence length of the input tensor, i.e. nb_samples_per_ray
super(ConvAutoEncoder, self).__init__()
# Encoder
self.conv1 = nn.Sequential(
nn.Conv1d(num_ch, num_ch * 2, 3, stride=1, padding=1),
# TransposeLayer(), # Use the custom TransposeLayer to transpose the output
nn.LayerNorm(
S_, elementwise_affine=False
            ),  ## LayerNorm over the last (sample) dim; normalizing over channels instead raised: RuntimeError: Given normalized_shape=[64], expected input with shape [*, 64], but got input of size [1, 64, 100000]
nn.ELU(alpha=1.0, inplace=True),
# TransposeLayer(), # Use the custom TransposeLayer to transpose the output
nn.MaxPool1d(2),
)
self.conv2 = nn.Sequential(
nn.Conv1d(num_ch * 2, num_ch * 4, 3, stride=1, padding=1),
# TransposeLayer(), # Use the custom TransposeLayer to transpose the output
nn.LayerNorm(S_ // 2, elementwise_affine=False),
nn.ELU(alpha=1.0, inplace=True),
# TransposeLayer(), # Use the custom TransposeLayer to transpose the output
nn.MaxPool1d(2),
)
self.conv3 = nn.Sequential(
nn.Conv1d(num_ch * 4, num_ch * 4, 3, stride=1, padding=1),
# TransposeLayer(), # Use the custom TransposeLayer to transpose the output
nn.LayerNorm(S_ // 4, elementwise_affine=False),
nn.ELU(alpha=1.0, inplace=True),
# TransposeLayer(), # Use the custom TransposeLayer to transpose the output
nn.MaxPool1d(2),
)
# Decoder
self.t_conv1 = nn.Sequential(
nn.ConvTranspose1d(num_ch * 4, num_ch * 4, 4, stride=2, padding=1),
nn.LayerNorm(S_ // 4, elementwise_affine=False),
nn.ELU(alpha=1.0, inplace=True),
)
self.t_conv2 = nn.Sequential(
nn.ConvTranspose1d(num_ch * 8, num_ch * 2, 4, stride=2, padding=1),
nn.LayerNorm(S_ // 2, elementwise_affine=False),
nn.ELU(alpha=1.0, inplace=True),
)
self.t_conv3 = nn.Sequential(
nn.ConvTranspose1d(num_ch * 4, num_ch, 4, stride=2, padding=1),
nn.LayerNorm(S_, elementwise_affine=False),
nn.ELU(alpha=1.0, inplace=True),
)
# Output
self.conv_out = nn.Sequential(
nn.Conv1d(num_ch * 2, num_ch, 3, stride=1, padding=1),
nn.LayerNorm(S_, elementwise_affine=False),
nn.ELU(alpha=1.0, inplace=True),
)
def forward(self, x):
input = x
x = self.conv1(x)
conv1_out = x
x = self.conv2(x)
conv2_out = x
x = self.conv3(x)
x = self.t_conv1(x)
x = self.t_conv2(torch.cat([x, conv2_out], dim=1))
x = self.t_conv3(torch.cat([x, conv1_out], dim=1))
x = self.conv_out(torch.cat([x, input], dim=1))
return x
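# Hedged shape check (illustrative sketch): S_ must be divisible by 8 so that the three
# MaxPool1d stages and the three ConvTranspose1d stages line up; the U-Net-style skip
# concatenations then restore the input shape.
def _example_conv_autoencoder():
    ae = ConvAutoEncoder(num_ch=32, S_=64)
    x = torch.randn(4, 32, 64)  # (B, num_ch, S_)
    y = ae(x)  # (4, 32, 64): same shape as the input
    return y.shape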
"""
Transformer encoder part from IBRNet network
https://github.com/googleinterns/IBRNet/blob/master/ibrnet/mlp_network.py
"""
class ScaledDotProductAttention(nn.Module):
"""Scaled Dot-Product Attention"""
def __init__(self, temperature, attn_dropout=0.1):
super().__init__()
self.temperature = temperature
# self.dropout = nn.Dropout(attn_dropout)
def forward(self, q, k, v, mask=None):
        attn = torch.matmul(
            q / self.temperature, k.transpose(2, 3)
        )  ## attn: (B, n_head, len_q, len_k), e.g. [32768, 4, 7, 7]
        if mask is not None:  ## mask: (B, 1, len_k), e.g. [32768, 1, 7]
            mask = mask.unsqueeze(-1)
            mask = mask.expand(
                -1, attn.shape[1], -1, attn.shape[-1]
            )  ## TODO: validate this broadcast over heads and key positions
            mask = 1.0 - (
                (1.0 - mask) * (1.0 - mask.transpose(-2, -1))
            )  ## symmetrize: a pair is masked if either view is invalid, so masked views neither attend nor get attended to (the expanded (len x len) mask is memory-heavy, e.g. forcing eval_batch_size down to 25000)
            attn = attn.masked_fill(
                mask == 1, -1e9
            )  ## set masked logits to -1e9 so their softmax weight is effectively zero (numerically stable)
# attn = attn * mask
"""
def masked_fill(self, mask, value):
result = self.clone() # Start with a copy of the original data
result[mask] = value # Replace values where the mask is true
return result
"""
attn = F.softmax(attn, dim=-1)
# attn = self.dropout(F.softmax(attn, dim=-1))
output = torch.matmul(attn, v)
return output, attn
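# Hedged usage sketch (illustrative sizes): attention over 7 source views per sample,
# with the last two views flagged invalid (mask entries of 1).
def _example_scaled_dot_product_attention():
    attn_fn = ScaledDotProductAttention(temperature=8 ** 0.5)
    q = k = v = torch.randn(16, 4, 7, 8)  # (B, n_head, len, d_k)
    mask = torch.zeros(16, 1, 7)
    mask[:, :, 5:] = 1.0  # 1 == invalid view
    out, attn = attn_fn(q, k, v, mask=mask)  # out: (16, 4, 7, 8), attn: (16, 4, 7, 7)
    return out.shape, attn.shape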
class PositionwiseFeedForward(nn.Module):
"""A two-feed-forward-layer module"""
def __init__(self, d_in, d_hid, dropout=0.1):
super().__init__()
self.w_1 = nn.Linear(d_in, d_hid) # position-wise
self.w_2 = nn.Linear(d_hid, d_in) # position-wise
self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
# self.dropout = nn.Dropout(dropout)
def forward(self, x):
residual = x
x = self.w_2(F.relu(self.w_1(x)))
# x = self.dropout(x)
x += residual
x = self.layer_norm(x)
return x
class PoswiseFF_emb4enc(nn.Module):
"""A two-feed-forward-layer module (tailored to encoder for DFT model's input) inspired code from Transformer's encoder"""
def __init__(self, d_in, d_hid, d_out, dropout=0.1):
super().__init__()
self.w_1 = nn.Linear(d_in, d_hid) # position-wise
self.w_2 = nn.Linear(d_hid, d_out) # position-wise
self.w_match = nn.Linear(d_in, d_out) # position-wise
# self.post_layer_norm = nn.LayerNorm(d_out, eps=1e-6)
self.pre_layer_norm = nn.LayerNorm(d_in, eps=1e-6)
# self.dropout = nn.Dropout(dropout)
def forward(self, x):
# embedding for residual input
emb_residual = self.w_match(x)
# Pre-layer normalization
x = self.pre_layer_norm(x)
# Transform the (normalized) input
        x = self.w_2(
            F.elu(self.w_1(x))
        )  ## default: ReLU | ELU/LeakyReLU avoid dying gradients, especially when dense outputs are expected, so the Transformer input does not lose expressiveness
# x = self.dropout(x)
        # Post-layer normalization
# x = self.post_layer_norm(x)
# Residual connection
x += emb_residual
return x
class PreLNPositionwiseFeedForward(nn.Module):
"""A two-feed-forward-layer module"""
def __init__(self, d_in, d_hid, dropout=0.1):
super().__init__()
self.w_1 = nn.Linear(d_in, d_hid) # position-wise
self.w_2 = nn.Linear(d_hid, d_in) # position-wise
self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
# self.dropout = nn.Dropout(dropout)
def forward(self, x):
residual = x
x = self.layer_norm(x)
x = self.w_2(F.leaky_relu(self.w_1(x))) ## default: F.relu
# x = self.dropout(x)
x += residual
return x
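# Hedged sketch (illustrative sizes): both feed-forward variants are shape-preserving
# residual blocks; the post-LN one normalizes after the residual add, the pre-LN one
# normalizes before the two linear layers.
def _example_ffn_variants():
    x = torch.randn(16, 7, 32)  # (B, len, d_in)
    post_ln = PositionwiseFeedForward(d_in=32, d_hid=64)
    pre_ln = PreLNPositionwiseFeedForward(d_in=32, d_hid=64)
    return post_ln(x).shape, pre_ln(x).shape  # both (16, 7, 32)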
def make_embedding_encoder(
config, input_channels: int, output_channels: int
) -> Optional[nn.Module]:
emb_enc_type = config.get("type", "none")
non_linearity = nn.ELU() # make configurable
if emb_enc_type == "none":
return None
elif emb_enc_type == "pwf":
return PoswiseFF_emb4enc(input_channels, 2 * output_channels, output_channels)
elif emb_enc_type == "ff":
return nn.Sequential(
nn.Linear(input_channels, 2 * output_channels, bias=True),
non_linearity,
nn.Linear(2 * output_channels, output_channels, bias=True),
) ## default: ReLU | nn.LeakyReLU()
elif emb_enc_type == "ffh":
return nn.Sequential(
nn.Linear(input_channels, output_channels, bias=True)
) ## default: ReLU | nn.LeakyReLU()
elif emb_enc_type == "hpwf":
return nn.Sequential( ## == mlp.PositionwiseFeedForward
nn.Linear(input_channels, 2 * output_channels, bias=True),
non_linearity,
nn.LayerNorm(2 * output_channels, eps=1e-6),
nn.Linear(2 * output_channels, output_channels, bias=True),
)
    else:
        raise NotImplementedError(
            f"__unrecognized embedding encoder type: {emb_enc_type}"
        )
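# Hedged usage sketch: the config dict below is an illustrative assumption; only its
# "type" key is read here. "pwf" builds the residual PoswiseFF_emb4enc projection,
# "ff"/"ffh"/"hpwf" build plain Linear stacks, and "none" returns None.
def _example_embedding_encoder():
    enc = make_embedding_encoder({"type": "pwf"}, input_channels=103, output_channels=64)
    tokens = torch.randn(16, 7, 103)  # (B, n_views, input_channels)
    emb = enc(tokens)  # (16, 7, 64), with a learned linear residual path
    return emb.shape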
class MultiHeadAttention(nn.Module):
"""Multi-Head Attention module"""
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
super().__init__()
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
self.attention = ScaledDotProductAttention(temperature=d_k**0.5)
# self.dropout = nn.Dropout(dropout)
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
def forward(self, q, k, v, mask=None):
d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
residual = q
# Pass through the pre-attention projection: b x lq x (n*dv)
# Separate different heads: b x lq x n x dv
q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
# Transpose for attention dot product: b x n x lq x dv
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
if mask is not None:
mask = mask.unsqueeze(1) # For head axis broadcasting.
q, attn = self.attention(q, k, v, mask=mask)
# Transpose to move the head dimension back: b x lq x n x dv
# Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
# q = self.dropout(self.fc(q))
q = self.fc(q)
q += residual
q = self.layer_norm(q)
return q, attn
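# Hedged usage sketch (illustrative sizes): post-LN multi-head self-attention over 7
# source-view tokens; mask entries of 1 mark invalid views and are broadcast over heads.
def _example_multi_head_attention():
    mha = MultiHeadAttention(n_head=4, d_model=32, d_k=8, d_v=8)
    tokens = torch.randn(16, 7, 32)  # (B, len, d_model)
    mask = torch.zeros(16, 7)
    mask[:, 5:] = 1.0
    out, attn = mha(tokens, tokens, tokens, mask=mask)  # (16, 7, 32), (16, 4, 7, 7)
    return out.shape, attn.shape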
class PreLNMultiHeadAttention(nn.Module):
"""Multi-Head Attention module"""
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
super().__init__()
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
self.attention = ScaledDotProductAttention(temperature=d_k**0.5)
# self.dropout = nn.Dropout(dropout)
self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
def forward(self, q, k, v, mask=None):
d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
residual = q
q = self.layer_norm(q)
# Pass through the pre-attention projection: b x lq x (n*dv)
# Separate different heads: b x lq x n x dv
q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
# Transpose for attention dot product: b x n x lq x dv
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
if mask is not None:
mask = mask.unsqueeze(1) # For head axis broadcasting.
q, attn = self.attention(q, k, v, mask=mask)
# Transpose to move the head dimension back: b x lq x n x dv
# Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
# q = self.dropout(self.fc(q))
q = self.fc(q)
q += residual
return q, attn
class EncoderLayer(nn.Module):
"""Compose with two layers"""
def __init__(
self, d_model, d_inner, n_head, d_k, d_v, dropout=0, pre_ln: bool = False
):
super(EncoderLayer, self).__init__()
if pre_ln:
self.slf_attn = PreLNMultiHeadAttention(
n_head, d_model, d_k, d_v, dropout=dropout
)
self.pos_ffn = PreLNPositionwiseFeedForward(
d_model, d_inner, dropout=dropout
)
else:
self.slf_attn = MultiHeadAttention(
n_head, d_model, d_k, d_v, dropout=dropout
)
self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
def forward(self, enc_input, slf_attn_mask=None):
enc_output, enc_slf_attn = self.slf_attn(
enc_input, enc_input, enc_input, mask=slf_attn_mask
)
enc_output = self.pos_ffn(enc_output)
return enc_output, enc_slf_attn
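# Hedged usage sketch (illustrative sizes): one encoder block; pre_ln=True swaps in the
# pre-LayerNorm attention and feed-forward variants, otherwise the post-LN IBRNet layout
# is used.
def _example_encoder_layer():
    layer = EncoderLayer(d_model=32, d_inner=64, n_head=4, d_k=8, d_v=8, pre_ln=True)
    tokens = torch.randn(16, 7, 32)  # (B, len, d_model)
    out, attn = layer(tokens)  # (16, 7, 32), (16, 4, 7, 7)
    return out.shape, attn.shape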
"""(modified) Transformer arch from Pytorch library
to be compatible with nn.TransformerEncoder() as input arg"""
class TrEnLayer(nn.Module):
r"""
Args:
        encoder_layer: an instance of the EncoderLayer class defined above (required).
num_layers: the number of sub-encoder-layers in the encoder (required).
norm: the layer normalization component (optional).
enable_nested_tensor: if True, input will automatically convert to nested tensor
(and convert back on output). This will improve the overall performance of
TransformerEncoder when padding rate is high. Default: ``True`` (enabled).
"""
def __init__(
self,
encoder_layer,
num_layers,
norm=None,
enable_nested_tensor=True,
mask_check=True,
):
super(TrEnLayer, self).__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )  ## deep copies, cf. torch.nn.modules.transformer._get_clones
self.num_layers = num_layers
self.norm = norm
self.enable_nested_tensor = enable_nested_tensor
self.mask_check = mask_check
def forward(
self,
src: torch.Tensor,
mask: Optional[torch.Tensor] = None,
src_key_padding_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
r"""Pass the input through the encoder layers in turn.
Args:
src: the sequence to the encoder (required).
mask: the mask for the src sequence (optional).
src_key_padding_mask: the mask for the src keys per batch (optional).
Shape:
see the docs in Transformer class.
"""
if src_key_padding_mask is not None:
_skpm_dtype = src_key_padding_mask.dtype
if _skpm_dtype != torch.bool and not torch.is_floating_point(
src_key_padding_mask
):
raise AssertionError(
"only bool and floating types of key_padding_mask are supported"
)
output = src
convert_to_nested = False
first_layer = self.layers[0]
src_key_padding_mask_for_layers = src_key_padding_mask
why_not_sparsity_fast_path = ""
str_first_layer = "self.layers[0]"
# if not isinstance(first_layer, EncoderLayer):
# why_not_sparsity_fast_path = f"{str_first_layer} was not IBR EncoderLayer"
# elif first_layer.norm_first :
# why_not_sparsity_fast_path = f"{str_first_layer}.norm_first was True"
# elif first_layer.training:
# why_not_sparsity_fast_path = f"{str_first_layer} was in training mode"
# elif not first_layer.self_attn.batch_first:
# why_not_sparsity_fast_path = f" {str_first_layer}.self_attn.batch_first was not True"
# elif not first_layer.self_attn._qkv_same_embed_dim:
# why_not_sparsity_fast_path = f"{str_first_layer}.self_attn._qkv_same_embed_dim was not True"
# elif not first_layer.activation_relu_or_gelu:
# why_not_sparsity_fast_path = f" {str_first_layer}.activation_relu_or_gelu was not True"
# elif not (first_layer.norm1.eps == first_layer.norm2.eps) :
# why_not_sparsity_fast_path = f"{str_first_layer}.norm1.eps was not equal to {str_first_layer}.norm2.eps"
# elif not src.dim() == 3:
# why_not_sparsity_fast_path = f"input not batched; expected src.dim() of 3 but got {src.dim()}"
# elif not self.enable_nested_tensor:
# why_not_sparsity_fast_path = "enable_nested_tensor was not True"
# elif src_key_padding_mask is None:
# why_not_sparsity_fast_path = "src_key_padding_mask was None"
# elif (((not hasattr(self, "mask_check")) or self.mask_check)
# and not torch._nested_tensor_from_mask_left_aligned(src, src_key_padding_mask.logical_not())):
# why_not_sparsity_fast_path = "mask_check enabled, and src and src_key_padding_mask was not left aligned"
# elif output.is_nested:
# why_not_sparsity_fast_path = "NestedTensor input is not supported"
# elif mask is not None:
# why_not_sparsity_fast_path = "src_key_padding_mask and mask were both supplied"
# elif first_layer.self_attn.num_heads % 2 == 1:
# why_not_sparsity_fast_path = "num_head is odd"
# elif torch.is_autocast_enabled():
# why_not_sparsity_fast_path = "autocast is enabled"
#
# if not why_not_sparsity_fast_path:
# tensor_args = (
# src,
# first_layer.self_attn.in_proj_weight,
# first_layer.self_attn.in_proj_bias,
# first_layer.self_attn.out_proj.weight,
# first_layer.self_attn.out_proj.bias,
# first_layer.norm1.weight,
# first_layer.norm1.bias,
# first_layer.norm2.weight,
# first_layer.norm2.bias,
# first_layer.linear1.weight,
# first_layer.linear1.bias,
# first_layer.linear2.weight,
# first_layer.linear2.bias,
# )
#
# if torch.overrides.has_torch_function(tensor_args):
# why_not_sparsity_fast_path = "some Tensor argument has_torch_function"
# elif not (src.is_cuda or 'cpu' in str(src.device)):
# why_not_sparsity_fast_path = "src is neither CUDA nor CPU"
# elif torch.is_grad_enabled() and any(x.requires_grad for x in tensor_args):
# why_not_sparsity_fast_path = ("grad is enabled and at least one of query or the "
# "input/output projection weights or biases requires_grad")
#
# if (not why_not_sparsity_fast_path) and (src_key_padding_mask is not None):
# convert_to_nested = True
# output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
# src_key_padding_mask_for_layers = None
for mod in self.layers:
# output = mod(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask_for_layers)
output = mod(output, slf_attn_mask=src_key_padding_mask_for_layers)[0]
if convert_to_nested:
output = output.to_padded_tensor(0.0)
if self.norm is not None:
output = self.norm(output)
return output
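# Hedged usage sketch (illustrative sizes): stack three deep-copied EncoderLayers; the
# (B, len) float src_key_padding_mask (1 == invalid) is handed to every layer's
# self-attention, mirroring nn.TransformerEncoder's interface.
def _example_tr_en_layer():
    block = EncoderLayer(d_model=32, d_inner=64, n_head=4, d_k=8, d_v=8)
    encoder = TrEnLayer(block, num_layers=3)
    tokens = torch.randn(16, 7, 32)
    mask = torch.zeros(16, 7)
    mask[:, 6:] = 1.0
    out = encoder(tokens, src_key_padding_mask=mask)  # (16, 7, 32)
    return out.shape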
# class TrEnLayer(torch.nn.Module):
# def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1,
# activation="relu", batch_first=True, norm_first=False,
# activation_relu_or_gelu=True):
# super(TransformerEncoderLayer, self).__init__()
# self.self_attn = torch.nn.MultiheadAttention(d_model, nhead, dropout=dropout)
# # Implementation of Feedforward model
# self.linear1 = torch.nn.Linear(d_model, dim_feedforward)
# self.dropout = torch.nn.Dropout(dropout)
# self.linear2 = torch.nn.Linear(dim_feedforward, d_model)
#
# self.norm1 = torch.nn.LayerNorm(d_model)
# self.norm2 = torch.nn.LayerNorm(d_model)
# self.dropout1 = torch.nn.Dropout(dropout)
# self.dropout2 = torch.nn.Dropout(dropout)
#
# # Legacy string support for activation function.
# if isinstance(activation, str):
# self.activation = _get_activation_fn(activation)
# else:
# self.activation = activation
#
# self.pos_ffn = PositionwiseFeedForward(d_model, dim_feedforward, dropout)
#
# self.self_attn.batch_first = batch_first
# self.self_attn._qkv_same_embed_dim = True # assuming d_model is the same for query, key, value
# self.norm_first = norm_first
# self.activation_relu_or_gelu = activation_relu_or_gelu
#
# def forward(self, src, src_mask=None, src_key_padding_mask=None):
# src2 = self.self_attn(src, src, src, attn_mask=src_mask,
# key_padding_mask=src_key_padding_mask)[0]
# if self.norm_first:
# src = src + self.dropout1(src2)
# src = self.norm1(src)
# src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
# src = src + self.dropout2(src2)
# src = self.norm2(src)
# else:
# src = self.norm1(src)
# src = src + self.dropout1(src2)
# src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
# src = self.norm2(src)
# src = src + self.dropout2(src2)
# return src
# '''
# c.f. nn.transformer.py
# '''
# def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]:
# if activation == "relu":
# return F.relu
# elif activation == "gelu":
# return F.gelu
#
# raise RuntimeError("activation should be relu/gelu, not {}".format(activation))
#
# def _get_clones(module, N):
# return ModuleList([copy.deepcopy(module) for i in range(N)])