HemanM committed
Commit b8565ff · verified · 1 Parent(s): 4fec0c7

Create finetune.py

Files changed (1)
  1. finetune.py +57 -0
finetune.py ADDED
@@ -0,0 +1,57 @@
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     DataCollatorForLanguageModeling,
+     Trainer,
+     TrainingArguments,
+ )
+ from datasets import load_dataset
+
+ # Load model and tokenizer
+ model_name = "distilgpt2"
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
+
+ # Load dialogue dataset (1% of the train split for a quick demo)
+ dataset = load_dataset("HuggingFaceH4/ultrachat", split="train[:1%]")
+
+ # Preprocess dataset: with batched=True, `examples` is a dict of columns,
+ # so iterate over the columns in parallel rather than over the dict itself
+ def preprocess(examples):
+     prompts = [
+         f"User: {prompt} Assistant: {response}"
+         for prompt, response in zip(examples["prompt"], examples["response"])
+     ]
+     return tokenizer(prompts, truncation=True, padding="max_length", max_length=512)
+
+ tokenized_dataset = dataset.map(
+     preprocess, batched=True, remove_columns=dataset.column_names
+ )
+
+ # Collator that copies input_ids into labels so the causal LM returns a loss
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     output_dir="./evo_finetuned",
+     per_device_train_batch_size=4,
+     num_train_epochs=3,
+     save_steps=1000,
+     save_total_limit=2,
+ )
+
+ # Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_dataset,
+     data_collator=data_collator,
+ )
+
+ # Fine-tune
+ trainer.train()
+
+ # Save model and tokenizer
+ model.save_pretrained("evo_finetuned")
+ tokenizer.save_pretrained("evo_finetuned")