|
import torch |
|
import math |
|
import torch.nn as nn |
|
|
|
class Embeddings(nn.Module):
    '''
    Embedding layer that sums three learned look-up tables -- token,
    speaker role, and conversation turn -- into one vector per position,
    then applies layer normalization.

    char: number of unique tokens in the vocabulary
    dimension_for_model: dimensionality of every embedding vector
    num_of_roles: number of distinct speaker roles (default 2)
    max_turns: maximum number of conversation turns (default 16)
    '''

    def __init__(self, char, dimension_for_model, num_of_roles=2, max_turns=16):
        super(Embeddings, self).__init__()

        # Separate tables so tokens, roles, and turns each get their own
        # learned representation of the same dimensionality.
        self.lut = nn.Embedding(char, dimension_for_model)
        self.lut_roles = nn.Embedding(num_of_roles, dimension_for_model)
        self.lut_turns = nn.Embedding(max_turns, dimension_for_model)
        self.dimension_for_model = dimension_for_model
        self.norm = nn.LayerNorm(dimension_for_model)

    def forward(self, x, roles, turns):
        '''
        Look up and sum the three embeddings for every position.

        x: LongTensor of token indices
        roles: LongTensor of role indices (same shape as x)
        turns: LongTensor of turn indices (same shape as x)

        Returns a tensor of shape (*x.shape, dimension_for_model).
        '''
        var = self.lut(x)
        # BUG FIX: roles and turns must be looked up in their own tables.
        # The original called self.lut(...) for all three inputs, which
        # ignored lut_roles/lut_turns entirely and raised an IndexError
        # whenever a role/turn index was >= the token vocabulary size.
        var = var + self.lut_roles(roles)
        var = var + self.lut_turns(turns)

        # Scale by sqrt(d_model), as in the standard Transformer recipe.
        # NOTE(review): the LayerNorm right after largely cancels this
        # uniform scaling -- kept to preserve the original behavior.
        var = var * math.sqrt(self.dimension_for_model)
        var = self.norm(var)
        return var
|
|
|
if __name__ == '__main__':
    d_model = 512

    # Character-level vocabulary: the lowercase letters plus a space.
    alphabet = list("abcdefghijklmnopqrstuvwxyz ")
    char2idx = {ch: i for i, ch in enumerate(alphabet)}
    vocab = len(alphabet)

    # Each speaker role maps to its own embedding index.
    look_up_table_roles = {'system': 0, 'user': 1}

    # Sample line: two-digit turn number, a role, then the utterance.
    input_str = "01 system: hello world"

    # Peel off the leading two-digit turn number, then split on ':'
    # to separate the role label from the conversation text.
    position = int(input_str[0:2].strip())
    remainder = input_str[2:]
    fields = remainder.split(':')
    role = fields[0].strip()
    conversation = fields[1].strip()

    # Build one index per recognized character; the role and turn
    # indices are repeated so all three sequences share one length.
    conversation_indices = []
    position_indices = []
    role_indices = []
    for ch in conversation:
        if ch in char2idx:
            conversation_indices.append(char2idx[ch])
            position_indices.append(position)
            role_indices.append(look_up_table_roles[role])

    # Batch dimension of 1 around each index sequence.
    conversations = torch.LongTensor([conversation_indices])
    roles = torch.LongTensor([role_indices])
    positions = torch.LongTensor([position_indices])

    # Run the sample through the embedding module and show the result.
    emb = Embeddings(vocab, d_model)
    embr = emb(conversations, roles, positions)
    print("embr:", embr)
|
|
|
|