Emanuele Mercadante committed on
Commit
1ce3fe8
·
1 Parent(s): 6d08e25

first commit

Browse files
Files changed (4) hide show
  1. Dockerfile +27 -0
  2. model/model_description.txt +0 -0
  3. requirements.txt +3 -0
  4. train_model.py +65 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# syntax=docker/dockerfile:1
# Use the official Hugging Face PyTorch base image with CUDA support.
# NOTE(review): ":latest" is a moving tag; pin a specific tag or digest for
# reproducible builds.
FROM huggingface/transformers-pytorch-cuda:latest

# Install system dependencies.
RUN apt-get update && apt-get install -y \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Non-secret coordinates of the Hugging Face Space to clone.
# The access token is deliberately NOT stored in ENV: ENV values are baked
# into the image configuration and are readable by anyone who can pull the
# image. Any token that was previously committed here should be revoked.
ENV HUGGINGFACE_USER=Rathalos
ENV HUGGINGFACE_REPO=training_incite

# Clone the private repository.
# The token is supplied at build time as a BuildKit secret, e.g.
#   docker build --secret id=hf_token,src=/path/to/token_file .
# It is only mounted for the duration of this RUN step. The remote URL is
# rewritten afterwards so the token does not persist in .git/config.
RUN --mount=type=secret,id=hf_token,required=true \
    git clone "https://${HUGGINGFACE_USER}:$(cat /run/secrets/hf_token)@huggingface.co/spaces/${HUGGINGFACE_USER}/${HUGGINGFACE_REPO}.git" \
    && git -C "${HUGGINGFACE_REPO}" remote set-url origin "https://huggingface.co/spaces/${HUGGINGFACE_USER}/${HUGGINGFACE_REPO}.git"

# Copy requirements.txt first and install the Python dependencies, so this
# layer is reused from cache when only the application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the whole build context into /app inside the container.
COPY . /app
WORKDIR /app

# Command that runs the training script.
CMD ["python", "train_model.py"]
model/model_description.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers
3
+ datasets
train_model.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Dataset, DataLoader
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
4
+
5
class TextDataset(Dataset):
    """Fixed-length blocks of token ids cut from a single text.

    The text is tokenized once, without truncation or padding, and the
    resulting id stream is split into consecutive blocks of ``max_length``
    tokens. Each dataset item is one block (a 1-D tensor), so a
    ``DataLoader`` over this dataset yields ``(batch, block_len)`` tensors.

    A trailing partial block is dropped so that every item has the same
    length and default batching works. If the whole text is shorter than
    ``max_length`` but has at least two tokens, it is kept as one shorter
    block instead of producing an empty dataset.

    Args:
        text: Raw training text.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', truncation=False)``;
            the result must expose an ``input_ids`` tensor.
        max_length: Number of tokens per block. Must be at least 2.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """

    def __init__(self, text, tokenizer, max_length):
        if max_length < 2:
            raise ValueError(
                f"max_length must be at least 2, got {max_length}"
            )
        self.tokenizer = tokenizer

        # Flatten to a 1-D stream of ids; no truncation, so text beyond the
        # first block is not silently discarded.
        token_ids = self.tokenizer(
            text, return_tensors='pt', truncation=False
        ).input_ids.reshape(-1)

        total = token_ids.numel()
        n_blocks = total // max_length
        if n_blocks:
            # Drop the trailing remainder so all blocks share one length.
            usable = n_blocks * max_length
            self.input_ids = token_ids[:usable].reshape(n_blocks, max_length)
        elif total >= 2:
            # Short text: keep it as a single block rather than nothing.
            self.input_ids = token_ids.reshape(1, total)
        else:
            # Nothing usable: an empty dataset with the expected block width.
            self.input_ids = token_ids[:0].reshape(0, max_length)

    def __len__(self):
        """Return the number of blocks."""
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        """Return block ``idx`` as a 1-D tensor of token ids."""
        return self.input_ids[idx]
15
+
16
def main(
    text_file_path='path/to/your/text/file.txt',  # change this path
    model_save_path='model',  # change this path
    model_name="togethercomputer/RedPajama-INCITE-Chat-3B-v1",
    max_length=512,
    batch_size=32,
    epochs=3,
    learning_rate=5e-5,
):
    """Fine-tune a causal language model on one text file and save it.

    Reads the text file, builds a ``TextDataset`` from it, trains the model
    with AdamW for ``epochs`` passes over the data, then writes the model
    and tokenizer to ``model_save_path``.

    Args:
        text_file_path: UTF-8 text file used as training data.
        model_save_path: Directory the model and tokenizer are saved to.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_length: Maximum sequence length handed to ``TextDataset``.
        batch_size: Number of dataset items per optimizer step.
        epochs: Number of passes over the dataset.
        learning_rate: Learning rate for AdamW.

    Raises:
        OSError: If ``text_file_path`` cannot be opened.
        ValueError: If the dataset built from the text is empty.
    """
    # Load text data
    with open(text_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Build the dataset before loading the model weights so that unusable
    # input fails before the expensive model load.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    dataset = TextDataset(text, tokenizer, max_length)
    if len(dataset) == 0:
        raise ValueError(
            f"No training samples could be built from {text_file_path!r}"
        )
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Setup optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        model.train()
        running_loss = 0.0
        for batch in dataloader:
            inputs = batch.to(device)
            optimizer.zero_grad()
            # The inputs double as labels for the language-modeling loss.
            outputs = model(inputs, labels=inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # Report the mean over the epoch rather than a single batch's loss.
        print(f"Loss: {running_loss / len(dataloader)}")

    # Save the model
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)


if __name__ == '__main__':
    main()