import torch.nn as nn

from .usta_layer_norm import UstaLayerNorm
from .usta_mlp import UstaMLP
from .usta_multi_head_attention import UstaMultiHeadAttention


class UstaDecoderBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads, context_length, device):
        super().__init__()

        # Multi-head self-attention over the token sequence.
        self.self_attention = UstaMultiHeadAttention(
            embedding_dim,
            embedding_dim,
            context_length,
            num_heads,
            dropout_rate=0.5,
            device=device,
        )
        self.norm1 = UstaLayerNorm(embedding_dim, device=device)
        # Position-wise feed-forward network applied after attention.
        self.mlp = UstaMLP(embedding_dim, embedding_dim, device=device)
        self.norm2 = UstaLayerNorm(embedding_dim, device=device)

    def forward(self, x):
        # Attention sub-block (pre-norm): keep the raw input as the
        # residual, normalize once, attend, then add the residual back.
        # (The original normalized twice and took the residual from the
        # normalized tensor, which breaks the identity skip path.)
        res = x
        x = self.norm1(x)
        x = self.self_attention(x)
        x = x + res

        # Feed-forward sub-block: the same pre-norm residual pattern.
        res = x
        x = self.norm2(x)
        x = self.mlp(x)
        x = x + res

        return x
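
# Minimal usage sketch, not part of the original module: it assumes the
# sibling Usta modules preserve a (batch, seq, embed) shape and accept a
# plain device string; the dimensions below are illustrative. Because of
# the relative imports above, run this as a module, e.g.
# `python -m <package>.usta_decoder_block`.
if __name__ == "__main__":
    import torch

    block = UstaDecoderBlock(
        embedding_dim=64, num_heads=4, context_length=16, device="cpu"
    )
    x = torch.randn(2, 16, 64)  # (batch, sequence, embedding), assumed layout
    out = block(x)
    print(out.shape)  # expected: torch.Size([2, 16, 64]), same shape as the input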