import torch
import math
import torch.nn as nn


# Embedding layers for converting tokens into vector representations
class Embeddings(nn.Module):
    '''
    Embedding module that maps each token in the vocabulary to a vector
    via a look-up table, and adds role and turn information on top of it.
    '''

    def __init__(self, char, dimension_for_model, num_of_roles=2, max_turns=16):
        '''
        char: the number of unique characters (vocabulary size)
        dimension_for_model: the desired embedding dimension
        num_of_roles: the number of distinct roles (speakers)
        max_turns: the maximum number of speaking turns
        '''
        # Initialize the parent class
        super(Embeddings, self).__init__()
        # Look-up table mapping token indices to vectors of size dimension_for_model
        self.lut = nn.Embedding(char, dimension_for_model)
        # Look-up table for the roles
        self.lut_roles = nn.Embedding(num_of_roles, dimension_for_model)
        # Look-up table for the speaking turns
        self.lut_turns = nn.Embedding(max_turns, dimension_for_model)
        # Store the model dimension
        self.dimension_for_model = dimension_for_model
        # Layer normalization applied to the combined embedding
        self.norm = nn.LayerNorm(dimension_for_model)

    def forward(self, x, roles, turns):
        '''
        Looks up the corresponding embeddings when index tensors are passed in.
        x: a tensor of token indices
        roles: a tensor of role indices
        turns: a tensor of turn indices
        '''
        var = self.lut(x)                  # Embed the actual spoken content
        var = var + self.lut_roles(roles)  # Add role information to the tensor
        var = var + self.lut_turns(turns)  # Add speaking-turn information to the tensor
        # Scale by sqrt(d_model) and apply layer normalization
        var = var * math.sqrt(self.dimension_for_model)
        var = self.norm(var)
        return var


if __name__ == '__main__':
    d_model = 512  # Desired model dimension

    # Instead of tokenizing into words, work at the character level
    characters = list("abcdefghijklmnopqrstuvwxyz ")

    # Create a mapping from each character to its index
    char2idx = {char: idx for idx, char in enumerate(characters)}
    vocab = len(characters)  # The vocabulary size is the number of unique characters

    # Create a look-up table for each role (speaker) within the chat
    look_up_table_roles = {'system': 0, 'user': 1}

    # Example input string
    input_str = "01 system: hello world"

    # Split the line into turn position, role, and conversation content
    position = int(input_str[0:2].strip())
    input_str = input_str[2:]
    conversation = input_str.split(':')[1].strip()
    role = input_str.split(':')[0].strip()

    # Convert the conversation into a list of indices, filtering out any
    # character not in the vocabulary. The role and turn indices are repeated
    # for every character in the content.
    conversation_indices = [char2idx[char] for char in conversation if char in char2idx]
    position_indices = [position for char in conversation if char in char2idx]
    role_indices = [look_up_table_roles[role] for char in conversation if char in char2idx]

    # Create tensors from the lists of indices.
    # Here we treat it as a batch with one sequence.
    conversations = torch.LongTensor([conversation_indices])
    roles = torch.LongTensor([role_indices])
    positions = torch.LongTensor([position_indices])

    # Initialize the embedding layer using the character-level vocabulary size
    emb = Embeddings(vocab, d_model)
    embr = emb(conversations, roles, positions)
    print("embr:", embr)
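    # A minimal sanity check, assuming the single-sequence batch built above:
    # the combined embedding should have shape (batch, sequence_length, d_model).
    assert embr.shape == (1, len(conversation_indices), d_model)
    print("embr shape:", embr.shape)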