class Seq2SeqDataset:
    """
    A dataset class for sequence-to-sequence tasks.

    Args:
        data (list): The dataset containing input and target sequences.
        tokenizer (PreTrainedTokenizer): The tokenizer to process the text data.
        config (object): Configuration object containing dataset parameters.

    Attributes:
        data (list): The dataset containing input and target sequences.
        tokenizer (PreTrainedTokenizer): The tokenizer to process the text data.
        config (object): Configuration object containing dataset parameters.
        max_len_input (int): Maximum length for input sequences.
        max_len_target (int): Maximum length for target sequences.

    Methods:
        __len__(): Returns the number of samples in the dataset.
        __getitem__(item): Returns the tokenized input and target sequences for a given index.
    """

    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.config = config
        self.max_len_input = self.config.max_seq_length
        self.max_len_target = self.config.max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        text = str(self.data[item][self.config.text_column])
        target = str(self.data[item][self.config.target_column])
        # Tokenize the source text, truncating to the configured input length.
        model_inputs = self.tokenizer(text, max_length=self.max_len_input, truncation=True)
        # Tokenize the target via text_target so the tokenizer applies
        # target-side handling (e.g. special tokens) where the model expects it.
        labels = self.tokenizer(text_target=target, max_length=self.max_len_target, truncation=True)
        # Seq2seq collators/trainers look for the target token ids under "labels".
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
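

# --- Usage sketch (not part of the original file) ---
# A minimal example of how this class might be driven. DummyConfig is a
# hypothetical stand-in for the project's real config object; only the four
# attributes the class actually reads are assumed here. Any small seq2seq
# checkpoint with a tokenizer (e.g. "t5-small") works for illustration.
from dataclasses import dataclass

from transformers import AutoTokenizer


@dataclass
class DummyConfig:
    max_seq_length: int = 128
    max_target_length: int = 64
    text_column: str = "text"
    target_column: str = "summary"


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    data = [{"text": "summarize: The quick brown fox jumps over the lazy dog.", "summary": "A fox jumps over a dog."}]
    dataset = Seq2SeqDataset(data=data, tokenizer=tokenizer, config=DummyConfig())
    sample = dataset[0]  # dict with input_ids, attention_mask, and labels
    print(sample["input_ids"], sample["labels"])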