import torch.nn as nn

from .usta_layer_norm import UstaLayerNorm
from .usta_mlp import UstaMLP
from .usta_multi_head_attention import UstaMultiHeadAttention


class UstaDecoderBlock(nn.Module):
  """A single transformer decoder block: multi-head self-attention followed
  by a position-wise MLP, each wrapped in a pre-norm residual connection."""

  def __init__(self, embedding_dim, num_heads, context_length, device):
    super().__init__()

    # Multi-head self-attention; input and output share embedding_dim.
    self.self_attention = UstaMultiHeadAttention(
      embedding_dim,
      embedding_dim,
      context_length,
      num_heads,
      dropout_rate=0.5,
      device=device
    )
    self.norm1 = UstaLayerNorm(embedding_dim, device=device)
    # Position-wise feed-forward network (embedding_dim -> embedding_dim).
    self.mlp = UstaMLP(embedding_dim, embedding_dim, device=device)
    self.norm2 = UstaLayerNorm(embedding_dim, device=device)

  def forward(self, x):
    # Attention sub-block with a pre-norm residual connection. The original
    # input is kept as the residual and each norm is applied exactly once,
    # before its sub-layer (the previous version normalized twice and took
    # the residual from the already-normalized tensor).
    res = x
    x = self.norm1(x)
    x = self.self_attention(x)
    x = x + res

    # Feed-forward sub-block, same pre-norm residual pattern.
    res = x
    x = self.norm2(x)
    x = self.mlp(x)
    x = x + res

    return x
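

# Minimal smoke test: a sketch, not part of the original module. It assumes
# the sibling Usta* modules behave as the constructor calls above suggest,
# and that the block maps (batch, seq_len, embedding_dim) to the same shape.
# Because of the relative imports, run it from within the package, e.g.
# `python -m <package>.usta_decoder_block`.
if __name__ == "__main__":
  import torch

  block = UstaDecoderBlock(embedding_dim=64, num_heads=4, context_length=32, device="cpu")
  x = torch.randn(2, 32, 64)  # (batch, seq_len, embedding_dim), seq_len <= context_length
  out = block(x)
  print(out.shape)  # expected: torch.Size([2, 32, 64])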