import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM


class TextDataset(Dataset):
    """Splits one long tokenized text into fixed-length blocks for causal LM fine-tuning."""

    def __init__(self, text, tokenizer, max_length):
        self.tokenizer = tokenizer
        # Tokenize the whole file once, then chunk it into blocks of max_length tokens.
        token_ids = tokenizer(text, return_tensors='pt').input_ids[0]
        n_blocks = token_ids.size(0) // max_length
        self.input_ids = token_ids[: n_blocks * max_length].view(n_blocks, max_length)

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        return self.input_ids[idx]


def main():
    # Hyperparameters
    max_length = 512
    batch_size = 32
    epochs = 3
    learning_rate = 5e-5

    # File path
    text_file_path = 'text_file.txt'  # Change this path

    # Load text data
    with open(text_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Load tokenizer and model
    model_name = "togethercomputer/RedPajama-INCITE-Chat-3B-v1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Preprocess data
    dataset = TextDataset(text, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Setup optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        model.train()
        for batch in dataloader:
            inputs = batch.to(device)
            # For causal LM fine-tuning the labels are the inputs themselves;
            # the model shifts them internally when computing the loss.
            outputs = model(input_ids=inputs, labels=inputs)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Loss: {loss.item()}")

    # Save the model
    model_save_path = 'model'  # Change this path
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)


if __name__ == '__main__':
    main()
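
# ---------------------------------------------------------------------------
# Usage sketch (assumption, not part of the original script): after training
# finishes, the saved directory ('model' above) can be reloaded with
# from_pretrained() to check that the fine-tuned weights generate text.
# Uncomment and run separately if useful.
# ---------------------------------------------------------------------------
# from transformers import AutoTokenizer, AutoModelForCausalLM
#
# tokenizer = AutoTokenizer.from_pretrained('model')
# model = AutoModelForCausalLM.from_pretrained('model')
# prompt_ids = tokenizer("Hello, how are you?", return_tensors='pt').input_ids
# generated = model.generate(prompt_ids, max_new_tokens=40)
# print(tokenizer.decode(generated[0], skip_special_tokens=True))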