# Importing the necessary libraries
import math
import torch.nn as nn
import torch


# Using the position-wise feed-forward network to expand the model dimension
class position_wide_feed_forward(nn.Module):
    def __init__(self, dimension_for_model, dimension_for_network, dropout=0.1):
        '''
        A constructor for the position-wise feed-forward network

        dimension_for_model: the dimension used in the embeddings layer
        dimension_for_network: the inner dimension the embedded results are expanded into
        dropout: optional dropout that zeroes out random elements during training to improve generalization
        '''
        super().__init__()  # Initializing the parent nn.Module class
        self.expansion = nn.Linear(dimension_for_model, dimension_for_network)  # Expanding the input from the multi-head attention into the larger inner dimension
        self.apply_dropout = nn.Dropout(dropout)  # Dropout layer that randomly zeroes elements during training to improve generalization
        self.activation = nn.ReLU()  # Introducing non-linearity so the encoder can represent non-linear mappings
        self.reverse_expansion = nn.Linear(dimension_for_network, dimension_for_model)  # Projecting the expanded dimension back to the original model dimension

    def forward(self, x):
        '''
        Applying the position-wise feed-forward transformation

        x: the data the feed-forward network is applied to
        '''
        parsed = self.apply_dropout(self.activation(self.expansion(x)))
        return self.reverse_expansion(parsed)


# Using layer normalization to add the sublayer output back to its input and then normalize
class Residual_layer(nn.Module):
    def __init__(self, dimension_for_model, dropout=0.1):
        '''
        A constructor for the residual-and-normalization layer

        dimension_for_model: the desired dimension from the embeddings layer
        dropout: optional dropout that zeroes out random elements during training to improve generalization
        '''
        super().__init__()
        self.normalize = nn.LayerNorm(dimension_for_model)  # Creating the layer normalization
        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, input_tensor, sublayer_tensor):
        '''
        input_tensor: the running tensor at the current stage
        sublayer_tensor: the tensor produced by the sublayer that still needs to be added
        '''
        result = self.apply_dropout(sublayer_tensor) + input_tensor  # Applying dropout to the sublayer output, then adding the residual connection (both tensors share the same dimension)
        return self.normalize(result)  # Returning the normalized result


if __name__ == '__main__':
    inp = torch.tensor([[
        [1.0, 2.0, 3.0, 4.0],
        [0.5, 1.5, 2.5, 3.5],
        [4.0, 3.0, 2.0, 1.0]
    ]])

    # Instantiate with no dropout
    ffn = position_wide_feed_forward(dimension_for_model=4, dimension_for_network=8, dropout=0.0)

    # Run it
    out = ffn(inp)

    # Print to verify the shape and a nontrivial transform
    print("Input:", inp)
    print("Output:", out)
    print("Output shape:", out.shape)

    x = torch.tensor([[[1.0, 2.0, 3.0, 4.0],
                       [4.0, 3.0, 2.0, 1.0],
                       [0.5, 1.5, 2.5, 3.5]]])

    # Dummy "sublayer" output to add
    sub = torch.tensor([[[0.1, 0.1, 0.1, 0.1],
                         [0.2, 0.2, 0.2, 0.2],
                         [0.3, 0.3, 0.3, 0.3]]])

    # Instantiate the residual+norm block (no dropout)
    layer = Residual_layer(dimension_for_model=4, dropout=0.0)

    # Run
    out = layer(x, sub)

    # Print everything
    print("Input X:\n", x)
    print("\nSublayer output:\n", sub)
    print("\nResidual+Norm output:\n", out)
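
    # --- Illustrative sketch (an assumption, not part of the original demo) ---
    # In a full encoder sub-layer, Residual_layer would typically wrap the
    # feed-forward network, i.e. LayerNorm(x + Dropout(FFN(x))).
    # Reusing the instances created above to show how the two blocks compose:
    composed = layer(x, ffn(x))
    print("\nResidual+Norm wrapped around the feed-forward output:\n", composed)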