Spaces:
Runtime error
Runtime error
File size: 4,412 Bytes
0901f7b 9d42f47 0681bc6 40bc3ae 1e660fe a208e00 99b2dac 691657e 30eb466 d275124 64226c2 187bb52 171817a 74e821f ea2c6ba 336121a 5c300b0 8e6d41d 08eb467 168d045 df20fa3 b62e2d9 fae8953 15505de 5686d0a d50b787 87281da 4acf537 027f3d0 a2f8e94 2ad8ac8 6a7186e 55f1dff 53082d2 22ddb9e 9fc882b da1164c f030196 05f01c9 0b6db0c 453434f f9e196a 68248ca ae4f8b6 9ed09a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import time
import torch
import numpy as np
import pandas as pd
from datasets import load_metric
from transformers import (
AdamW,
T5ForConditionalGeneration,
T5TokenizerFast as T5Tokenizer,
)
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning import LightningDataModule
from pytorch_lightning import LightningModule
torch.cuda.empty_cache()
pl.seed_everything(42)
class DataModule(Dataset):
"""
Data Module for pytorch
"""
def __init__(
self,
data: pd.DataFrame,
tokenizer: T5Tokenizer,
source_max_token_len: int = 512,
target_max_token_len: int = 512,
):
"""
:param data:
:param tokenizer:
:param source_max_token_len:
:param target_max_token_len:
"""
self.data = data
self.target_max_token_len = target_max_token_len
self.source_max_token_len = source_max_token_len
self.tokenizer = tokenizer
def __len__(self):
return len(self.data)
def __getitem__(self, index: int):
data_row = self.data.iloc[index]
input_encoding = self.tokenizer(
data_row["input_text"],
max_length=self.source_max_token_len,
padding="max_length",
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
return_tensors="pt",
)
output_encoding = self.tokenizer(
data_row["output_text"],
max_length=self.target_max_token_len,
padding="max_length",
truncation=True,
return_attention_mask=True,
add_special_tokens=True,
return_tensors="pt",
)
labels = output_encoding["input_ids"]
labels[
labels == 0
] = -100
return dict(
keywords=data_row["keywords"],
text=data_row["text"],
keywords_input_ids=input_encoding["input_ids"].flatten(),
keywords_attention_mask=input_encoding["attention_mask"].flatten(),
labels=labels.flatten(),
labels_attention_mask=output_encoding["attention_mask"].flatten(),
)
class PLDataModule(LightningDataModule):
def __init__(
self,
train_df: pd.DataFrame,
test_df: pd.DataFrame,
tokenizer: T5Tokenizer,
source_max_token_len: int = 512,
target_max_token_len: int = 512,
batch_size: int = 4,
split: float = 0.1
):
"""
:param data_df:
:param tokenizer:
:param source_max_token_len:
:param target_max_token_len:
:param batch_size:
:param split:
"""
super().__init__()
self.train_df = train_df
self.test_df = test_df
self.split = split
self.batch_size = batch_size
self.target_max_token_len = target_max_token_len
self.source_max_token_len = source_max_token_len
self.tokenizer = tokenizer
def setup(self, stage=None):
self.train_dataset = DataModule(
self.train_df,
self.tokenizer,
self.source_max_token_len,
self.target_max_token_len,
)
self.test_dataset = DataModule(
self.test_df,
self.tokenizer,
self.source_max_token_len,
self.target_max_token_len,
)
def train_dataloader(self):
""" training dataloader """
return DataLoader(
self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2
)
def test_dataloader(self):
""" test dataloader """
return DataLoader(
self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2
)
def val_dataloader(self):
""" validation dataloader """
return DataLoader(
self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2
)
class LightningModel(LightningModule):
""" PyTorch Lightning Model class"""
def __init__(self, tokenizer, model, output: str = "outputs"):
"""
|