import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Set up the device for GPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def set_seed(seed_value=30):
    """Set seeds for reproducibility."""
    random.seed(seed_value)                    # Python random module
    np.random.seed(seed_value)                 # NumPy
    torch.manual_seed(seed_value)              # Torch (CPU)
    torch.cuda.manual_seed_all(seed_value)     # All GPUs, if using multi-GPU
    torch.backends.cudnn.deterministic = True  # cuDNN determinism
    torch.backends.cudnn.benchmark = False


set_seed(30)

# Load your dataset
data_path = 'final_dataset.csv'  # Update this path to where your data is stored in Colab
data = pd.read_csv(data_path)

# Load the model and tokenizer used to generate reference summaries
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.to(device)
model.eval()


# Function to generate summaries
def generate_summaries(texts, model, tokenizer, device, max_length=150):
    summaries = []
    for text in texts:
        encoded_text = tokenizer.encode(
            "summarize: " + text,
            return_tensors='pt',
            max_length=512,
            truncation=True
        ).to(device)
        summary_ids = model.generate(
            encoded_text,
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries


# Process the data in chunks to manage memory more effectively (if needed)
chunk_size = 10  # Adjust chunk size based on your dataset size and memory constraints
num_chunks = len(data) // chunk_size + (1 if len(data) % chunk_size != 0 else 0)

all_summaries = []
for i in range(num_chunks):
    batch = data['Content'].iloc[i * chunk_size:(i + 1) * chunk_size]
    batch_summaries = generate_summaries(batch, model, tokenizer, device)
    all_summaries.extend(batch_summaries)

# Add summaries to the DataFrame and save to a new CSV file
data['Summary'] = all_summaries
output_path = '/content/summarized_data.csv'
data.to_csv(output_path, index=False)
print(f"Data with summaries saved to {output_path}")


class PolicyDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        policy_text = self.data.iloc[idx]['Content']
        summary_text = self.data.iloc[idx]['Summary']

        input_encoding = self.tokenizer.encode_plus(
            policy_text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer.encode_plus(
            summary_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = target_encoding['input_ids'].squeeze()
        labels_mask = target_encoding['attention_mask'].squeeze()
        # Replace padding token ids with -100 so they are ignored by
        # CrossEntropyLoss and by the F1 computation in evaluate()
        labels = labels.masked_fill(labels_mask == 0, -100)

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': labels,
            'labels_mask': labels_mask
        }


# Reload the summarized data and a fresh model for fine-tuning
data = pd.read_csv(output_path)  # Ensure this points to the CSV saved above
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
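# Optional sanity check before building the DataLoaders (a minimal sketch; the
# variable name 'sample_item' is illustrative and not part of the original script).
# It confirms that a single PolicyDataset item has the expected tensor shapes and
# that padded label positions were replaced with -100.
sample_item = PolicyDataset(data, tokenizer)[0]
print({key: tuple(tensor.shape) for key, tensor in sample_item.items()})
print("Padded label positions set to -100:", int((sample_item['labels'] == -100).sum()))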
# Prepare data splits and loaders
train_data, eval_data = train_test_split(data, test_size=0.1, random_state=42)
train_dataset = PolicyDataset(train_data, tokenizer)
eval_dataset = PolicyDataset(eval_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)


def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)  # Shape: [batch_size, seq_length]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits  # Shape: [batch_size, seq_length, vocab_size]

        # Flatten logits to [batch_size * seq_length, vocab_size] and labels to
        # [batch_size * seq_length] for CrossEntropyLoss; -100 positions are ignored
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


def evaluate(model, eval_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            # Calculate loss
            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
            total_loss += loss.item()

            # Calculate token-level F1, excluding padded positions (-100)
            predictions = torch.argmax(logits, dim=-1).flatten().cpu().numpy()
            labels_flat = labels.flatten().cpu().numpy()
            valid_indices = labels_flat != -100
            all_predictions.extend(predictions[valid_indices])
            all_labels.extend(labels_flat[valid_indices])

    f1 = f1_score(all_labels, all_predictions, average='macro')
    return total_loss / len(eval_loader), f1


optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()  # ignore_index defaults to -100, matching the label masking

# Training loop
for epoch in range(5):  # Adjust the number of epochs as needed
    train_loss = train(model, train_loader, optimizer, criterion, device)
    eval_loss, eval_f1 = evaluate(model, eval_loader, criterion, device)
    print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, "
          f"Eval Loss = {eval_loss:.4f}, Eval F1 = {eval_f1:.4f}")


# Function to run training for one hyperparameter configuration
def run_training(lr, batch_size, number_of_epochs=5):
    model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(number_of_epochs):
        train_loss = train(model, train_loader, optimizer, criterion, device)
        eval_loss, eval_f1 = evaluate(model, eval_loader, criterion, device)
        print(f"LR: {lr}, Batch size: {batch_size}, Epoch: {epoch + 1}, "
              f"Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}, Eval F1: {eval_f1:.4f}")


# Define hyperparameters to test
learning_rates = [1e-5, 3e-5, 5e-5]
batch_sizes = [16, 32, 64]

# Run grid search
for lr in learning_rates:
    for batch_size in batch_sizes:
        run_training(lr, batch_size, number_of_epochs=5)  # Specify the number of epochs here
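

# Optional follow-up (a minimal sketch, not part of the original grid search): once a
# configuration has been chosen from the printed results above, retrain with it and save
# the checkpoint. The values of best_lr / best_batch_size and the output directory name
# 'finetuned-t5-small-policy' are illustrative assumptions, not results from this run.
best_lr, best_batch_size = 3e-5, 16  # Read these off the grid-search output
final_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
final_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)
final_optimizer = optim.AdamW(final_model.parameters(), lr=best_lr)
final_criterion = nn.CrossEntropyLoss()
for epoch in range(5):
    train(final_model, final_loader, final_optimizer, final_criterion, device)
final_model.save_pretrained('finetuned-t5-small-policy')
tokenizer.save_pretrained('finetuned-t5-small-policy')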