Spaces:
Build error
Build error
File size: 2,042 Bytes
1ce3fe8 130c974 1ce3fe8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
class TextDataset(Dataset):
    """Fixed-length token blocks cut from a single text for causal-LM training.

    The whole text is tokenized once, without truncation or padding, and the
    resulting token stream is split into consecutive, non-overlapping blocks
    of ``max_length`` tokens. Each dataset item is one such block, so a
    ``DataLoader`` batches them into ``(batch_size, max_length)`` tensors.

    Args:
        text: Raw training text.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt')``
            whose result exposes an ``input_ids`` tensor.
        max_length: Number of tokens per block; must be positive.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """

    def __init__(self, text: str, tokenizer, max_length: int):
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        self.tokenizer = tokenizer

        # Tokenize the full text: truncating here would silently discard
        # everything after the first ``max_length`` tokens.
        token_ids = self.tokenizer(text, return_tensors='pt').input_ids.reshape(-1)
        total = token_ids.numel()

        if total >= max_length:
            # Drop the incomplete tail so every block has the same length and
            # batches can be stacked without a padding token.
            usable = (total // max_length) * max_length
            self.input_ids = token_ids[:usable].reshape(-1, max_length)
        elif total > 0:
            # Text shorter than one block: keep it as a single shorter block.
            self.input_ids = token_ids.reshape(1, total)
        else:
            # No tokens at all: an empty dataset with zero blocks.
            self.input_ids = token_ids.reshape(0, max_length)

    def __len__(self) -> int:
        """Return the number of token blocks."""
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        """Return the block at ``idx`` as a 1-D tensor of token ids."""
        return self.input_ids[idx]
def main(
    *,
    text_file_path='text_file.txt',
    model_save_path='model',
    model_name="togethercomputer/RedPajama-INCITE-Chat-3B-v1",
    max_length=512,
    batch_size=32,
    epochs=3,
    learning_rate=5e-5,
):
    """Fine-tune a causal language model on a plain-text file and save it.

    Reads the text file, builds a ``TextDataset`` from it, trains the model
    with AdamW for ``epochs`` passes over the data, printing the mean loss of
    each epoch, then writes the model and tokenizer to ``model_save_path``.

    Args:
        text_file_path: Path of the UTF-8 text file used as training data.
        model_save_path: Directory the trained model and tokenizer are saved to.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_length: Sequence length passed to ``TextDataset``.
        batch_size: Number of dataset items per optimization step.
        epochs: Number of passes over the dataset.
        learning_rate: Learning rate for the AdamW optimizer.

    Raises:
        ValueError: If the dataset built from the text file is empty.
    """
    # Load text data
    with open(text_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Padding fails when the tokenizer defines no pad token; fall back to the
    # end-of-sequence token so any padding done during preprocessing works.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Preprocess data
    dataset = TextDataset(text, tokenizer, max_length)
    if len(dataset) == 0:
        raise ValueError(f"No training data could be built from {text_file_path!r}")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Setup optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        model.train()
        running_loss = 0.0
        for batch in dataloader:
            inputs = batch.to(device)
            # Passing the inputs as labels makes the model compute its own
            # language-modeling loss.
            outputs = model(inputs, labels=inputs)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Report the mean over the epoch rather than only the last batch.
        print(f"Loss: {running_loss / len(dataloader)}")

    # Save the model
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|